X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/249c4c5ea9376c24572daf9c2effa7484a282f14..3d1f044b704633e2e541231cd17ae9ecf9ad5c7a:/icuSources/test/testdata/break_rules/line.txt diff --git a/icuSources/test/testdata/break_rules/line.txt b/icuSources/test/testdata/break_rules/line.txt index bb29a2d1..b33994b2 100644 --- a/icuSources/test/testdata/break_rules/line.txt +++ b/icuSources/test/testdata/break_rules/line.txt @@ -19,13 +19,14 @@ locale = en; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [[:LineBreak = EB:] \U0001F46A-\U0001F46D\U0001F46F\U0001F91D\U0001F93C]; @@ -66,7 +67,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -77,23 +78,39 @@ LB5.3: NL ÷; LB6: . (BK | CR | LF | NL); LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); +# LB8 break after ZW SP*. +# Precedes LB7 because both rules will match the sequences like ZW SP, +# and LB8 must take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# Numbers. Equivalent to Tailoring example 8 from UAX 14. +# Moved up, before LB14, because it can match longer sequences which must take precedence. +LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; + # Rules LB14 - LB17. -# Moved before LB7, because they can match a longer sequence that would also match LB7, -# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it, -# "while only the prefix "OP CM SP" matches LB7.1 +# Moved up, before LB7, because they can match a longer sequence that would also match LB7. +# For example, the sequence "OP CM SP AL" matches LB14 +# while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; + +# LB 14a Break before an IS that begins a number and follows a space. +LB14a: SP ÷ IS CM* NU; + +# LB14b × IS +LB14b.1: [^SP] CM* IS; +LB14b.2: SP IS; + LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -115,15 +132,9 @@ LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; -# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14. -# -# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. -# LB13.2 SP CM* [CL CP EX IS SY] - -LB13.1: [^NU SP] CM* [CL CP IS SY]; -LB13.2: [^SP] CM* EX; -LB13.2: SP [CL CP EX IS SY]; - +# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces. +LB13.1: [^SP] CM* [CL CP EX SY]; +LB13.2: SP [CL CP EX SY]; # LB 14-17 are moved above LB 7. @@ -142,6 +153,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; @@ -166,8 +180,6 @@ LB23a.2: (ID | EB | EM) CM* PO; LB24.2: (PR | PO) CM* (AL | HL); LB24.3: (AL | HL | CM) CM* (PR | PO); -# Numbers. Equivalent to Tailoring example 8 from UAX 14. -LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; LB26.1: JL CM* (JL | JV | H2 | H3); LB26.2: (JV | H2) CM* (JV | JT); @@ -188,8 +200,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier