ICU-66108.tar.gz

[apple/icu.git] / icuSources / test / testdata / break_rules / line.txt
diff --git a/icuSources/test/testdata/break_rules/line.txt b/icuSources/test/testdata/break_rules/line.txt

index b33994b2a7a2762a442f35eeed997c0482e5a8ae..543fefd6ebf2603cdf538dcff8365a5e52941c56 100644 (file)
--- a/icuSources/test/testdata/break_rules/line.txt
+++ b/icuSources/test/testdata/break_rules/line.txt
@@ -6,7 +6,14 @@
  # file: line.txt
  #
  # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
-# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0,
+# with the following modification:
+#
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule 20.9
+#
+#         This rule set corresponds to CSS line-break=strict (BCP47 -u-lb-strict):
+#         characters of class CJ are set to behave like NS.
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -61,6 +68,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
+# OP30 and CP30 are the variants of OP and CP used by rule LB30 of UAX 14: OP and CP
+# with East Asian Width F, W and H characters removed. Limitations of this monkey test rule
+# parser require these definitions to be pulled out rather than written in-line in LB30.
+
+OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
+CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
+
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
@@ -165,11 +179,7 @@ LB21.2:      BB CM* [^CM CB];
  
  LB21b:       SY CM* HL;
  
-LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
-LB22.2:       EX CM* IN;
-LB22.3:       (ID | EB | EM) CM* IN;
-LB22.4:       IN CM* IN;
-LB22.5:       NU CM* IN;
+LB22:        . CM* IN;
  
  LB23.1:      (AL | HL | CM) CM* NU;
  LB23.2:      NU CM* (AL | HL);
@@ -196,13 +206,13 @@ LB28:        (AL | HL | CM)CM* (AL | HL);
  LB29:        IS CM* (AL | HL);
  
  # LB30  is adjusted for unattached leading CM being treated as AL.
-LB30.1:      (AL | CM | HL | NU) CM* OP;
-LB30.2:      CP CM* (AL | HL | NU);
+LB30.1:      (AL | CM | HL | NU) CM* OP30;
+LB30.2:      CP30 CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;