X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/2ca993e82fb37b597a3c73ecd1586a139a6579c5..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/brkitr/rules/word_POSIX.txt diff --git a/icuSources/data/brkitr/rules/word_POSIX.txt b/icuSources/data/brkitr/rules/word_POSIX.txt index 23f1aea5..79126931 100644 --- a/icuSources/data/brkitr/rules/word_POSIX.txt +++ b/icuSources/data/brkitr/rules/word_POSIX.txt @@ -1,4 +1,6 @@ # +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html # Copyright (C) 2002-2016, International Business Machines Corporation # and others. All Rights Reserved. # @@ -6,8 +8,9 @@ # # ICU Word Break Rules, POSIX locale. # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0 -# with additions from L2/16-011R3 for Emoji sequences. +# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 +# with additions for Emoji Sequences from https://goo.gl/cluFCn +# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also. @@ -19,26 +22,20 @@ ############################################################################## !!chain; +!!quoted_literals_only; # # Character Class Definitions. 
# -# Apple $EmojiForMods becomes $E_Base here -$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; -# Apple $EmojiMods becomes $E_Modifier here, same chars -$E_Modifier = [\U0001F3FB-\U0001F3FF]; -$ZWJ = [\u200D]; -# Apple $EmojiForSeqs becomes $GAZ here (only emoji that follow a ZWJ) -$GAZ = [\u2640\u2642\u2764\U0001F308\U0001F466-\U0001F469\U0001F48B\U0001F5E8]; - $CR = [\p{Word_Break = CR}]; $LF = [\p{Word_Break = LF}]; $Newline = [\p{Word_Break = Newline} ]; -$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]]; +$Extend = [\p{Word_Break = Extend}]; +$ZWJ = [\p{Word_Break = ZWJ}]; $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; -$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]]; +$Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; @@ -49,6 +46,8 @@ $MidLetter = [\p{Word_Break = MidLetter} - [\:]]; $MidNum = [\p{Word_Break = MidNum} [.]]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [:ExtPict:]; $Han = [:Han:]; $Hiragana = [:Hiragana:]; @@ -93,17 +92,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; ## ------------------------------------------------- -!!forward; - - # Rule 3 - CR x LF # $CR $LF; -# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed. +# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. # -$ZWJ $GAZ; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. 
+# +$WSegSpace $WSegSpace; # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning # of a region of Text. The rule here comes into play when the start of text @@ -120,9 +119,7 @@ $KatakanaEx {400}; # note: these status values override those from rule 5 $HiraganaEx {400}; # by virtue of being numerically larger. $IdeographicEx {400}; # -$E_Base ($Extend | $Format | $ZWJ)*; -$E_Modifier ($Extend | $Format | $ZWJ)*; -$GAZ ($Extend | $Format | $ZWJ)*; +$Extended_Pict ($Extend | $Format | $ZWJ)*; # # rule 5 @@ -174,7 +171,7 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) $ExtendNumLetEx $NumericEx {100}; # (13b) $ExtendNumLetEx $KatakanaEx {400}; # (13b) -# rule 13c +# rules 15 - 17 # Pairs of Regional Indicators stay together. # With rule chaining disabled by ^, this rule will match exactly two of them. # No other rule begins with a Regional_Indicator, so chaining cannot extend the match. @@ -185,141 +182,6 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b) $HangulSyllable $HangulSyllable {200}; $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found -# rule 13d -# E_Base x E_Modifier -# -($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier; - - -## ------------------------------------------------- - -!!reverse; - -$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter; -$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus; -$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote; -$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote; -$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet; -$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric; -$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum; -$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter; -$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana; -$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana; -$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet; -$BackRegional_IndicatorEx = ($Format 
| $Extend | $ZWJ)* $Regional_Indicator; - -# rule 3 -$LF $CR; - -# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed. -# -$GAZ $ZWJ; - -# rule 4 -($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?; - -# rule 5 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 6 and 7 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 7a -$BackSingle_QuoteEx $BackHebrew_LetterEx; - -# Rule 7b and 7c -$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx; - -# rule 8 - -$BackNumericEx $BackNumericEx; - -# rule 9 - -$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 10 - -($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx; - -# rule 11 and 12 - -$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx; - -# rule 13 - -$BackKatakanaEx $BackKatakanaEx; - -# rules 13 a/b -# -$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); -($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; - -# rule 13c - -^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - -$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - -# special handling for CJK 
characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable; -$KanaKanji $KanaKanji; #different rule status if both kanji and kana found - -# rule 13d - -$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ); - - - -## ------------------------------------------------- - -!!safe_reverse; - -# rule 3 -($Extend | $Format | $ZWJ)+ .?; - -# rule 6 -($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 7b -$Double_Quote $BackHebrew_LetterEx; - - -# rule 11 -($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; - -# rule 13c -$BackRegional_IndicatorEx*; - -# For dictionary-based break -$dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# rule 4 -($Extend | $Format | $ZWJ)+ .?; - -# rule 6 -($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); - -# rule 7b -$Double_QuoteEx $Hebrew_LetterEx; - -# rule 11 -($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; - -# rule 13c -$Regional_IndicatorEx*; - -# For dictionary-based break -$dictionary $dictionary; +# Rule 999 +# Match a single code point if no other rule applies. +.;