]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/test/testdata/regextst.txt
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / test / testdata / regextst.txt
index 5716ab54449aec6341470d75ef8fc2f98072de39..6873f4835c9d085a5cec663a42e82790fc6bcd40 100644 (file)
@@ -1,4 +1,6 @@
-# Copyright (c) 2001-2013 International Business Machines
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2001-2015 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #  file:
 "ab(?:(c)|(d))\1"            i "abde"
 "ab(?:(c)|(d))\1"            i "<0>ab<1>c</1>c</0>e"
 
+# Named back references
+"(?<one>abcd)\k<one>"          "<0><1>abcd</1>abcd</0>"
+"(no)?(?<one>abcd)\k<one>"     "<0><2>abcd</2>abcd</0>"
+
+"(?<a_1>...)"               E  "  "   # backref names are ascii letters & numbers only
+"(?<1a>...)"                E  "  "   # backref names must begin with a letter
+"(?<a>.)(?<a>.)"            E  "  "   # Repeated names are illegal.
+
+
 # Case Insensitive
 "aBc"                    i      "<0>ABC</0>"
 "a[^bc]d"                i      "ABD"
 '((((((((((a))))))))))\10' i    "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
 
 "(?:(?i)a)b"                    "<0>Ab</0>"
-"ab(?i)cd"                      "<0>abCd</0>"
+"ab(?i)cd"                         "<0>abCd</0>"
 "ab$cd"                         "abcd"
 
+"ssl"                      i    "abc<0>ßl</0>xyz"
+"ssl"                      i    "abc<0>ẞl</0>xyz"
+"FIND"                     i    "can <0>find</0> ?"  # fi ligature, \ufb01
+"find"                      i    "can <0>FIND</0> ?"
+"ῧ"                        i    "xxx<0>ῧ</0>xxx"    # Composed char (match string) decomposes when case-folded (pattern)
+
 # White space handling
 "a b"                           "ab"
 "abc "                          "abc"
 "abc\jkl"                         "<0>abcjkl</0>"    # escape of a non-special letter is just itself.
 "abc[ \j]kl"                      "<0>abcjkl</0>"
 
+#
+# \R  all newline sequences.
+#
+"abc\Rxyz"                        "<0>abc\u000axyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u000bxyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u000cxyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u000dxyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u0085xyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u2028xyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u2029xyz</0>gh"
+"abc\Rxyz"                        "<0>abc\u000d\u000axyz</0>gh"
+
+"abc\R\nxyz"                      "abc\u000d\u000axyzgh"          # \R cannot match only the CR from a CR/LF sequence.
+"abc\r\nxyz"                      "<0>abc\u000d\u000axyz</0>gh"
+
+"abc\Rxyz"                        "abc\u0009xyz"                  # Assorted non-matches.
+"abc\Rxyz"                        "abc\u000exyz"
+"abc\Rxyz"                        "abc\u202axyz"
+
+# \v \V single character new line sequences.
+
+"abc\vxyz"                        "<0>abc\u000axyz</0>gh"
+"abc\vxyz"                        "<0>abc\u000bxyz</0>gh"
+"abc\vxyz"                        "<0>abc\u000cxyz</0>gh"
+"abc\vxyz"                        "<0>abc\u000dxyz</0>gh"
+"abc\vxyz"                        "<0>abc\u0085xyz</0>gh"
+"abc\vxyz"                        "<0>abc\u2028xyz</0>gh"
+"abc\vxyz"                        "<0>abc\u2029xyz</0>gh"
+"abc\vxyz"                        "abc\u000d\u000axyzgh"
+"abc\vxyz"                        "abc?xyzgh"
+
+"abc[\v]xyz"                      "<0>abc\u000axyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u000bxyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u000cxyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u000dxyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u0085xyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u2028xyz</0>gh"
+"abc[\v]xyz"                      "<0>abc\u2029xyz</0>gh"
+"abc[\v]xyz"                      "abc\u000d\u000axyzgh"
+"abc[\v]xyz"                      "abc?xyzgh"
+
+"abc\Vxyz"                        "abc\u000axyzgh"
+"abc\Vxyz"                        "abc\u000bxyzgh"
+"abc\Vxyz"                        "abc\u000cxyzgh"
+"abc\Vxyz"                        "abc\u000dxyzgh"
+"abc\Vxyz"                        "abc\u0085xyzgh"
+"abc\Vxyz"                        "abc\u2028xyzgh"
+"abc\Vxyz"                        "abc\u2029xyzgh"
+"abc\Vxyz"                        "abc\u000d\u000axyzgh"
+"abc\Vxyz"                        "<0>abc?xyz</0>gh"
+
+# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab
+
+"abc\hxyz"                        "<0>abc xyz</0>gh"
+"abc\Hxyz"                        "abc xyzgh"
+"abc\hxyz"                        "<0>abc\u2003xyz</0>gh"
+"abc\Hxyz"                        "abc\u2003xyzgh"
+"abc\hxyz"                        "<0>abc\u0009xyz</0>gh"
+"abc\Hxyz"                        "abc\u0009xyzgh"
+"abc\hxyz"                        "abc?xyzgh"
+"abc\Hxyz"                        "<0>abc?xyz</0>gh"
+
+"abc[\h]xyz"                      "<0>abc xyz</0>gh"
+"abc[\H]xyz"                      "abc xyzgh"
+"abc[\h]xyz"                      "<0>abc\u2003xyz</0>gh"
+"abc[\H]xyz"                      "abc\u2003xyzgh"
+"abc[\h]xyz"                      "<0>abc\u0009xyz</0>gh"
+"abc[\H]xyz"                      "abc\u0009xyzgh"
+"abc[\h]xyz"                      "abc?xyzgh"
+"abc[\H]xyz"                      "<0>abc?xyz</0>gh"
+
+
 #
 # Bug xxxx
 #
 "(?<=a{1,})bc"           E       "aaaa<0>bc</0>def"   # U_REGEX_LOOK_BEHIND_LIMIT error.
 "(?<=(?:){11})bc"                "<0>bc</0>"          # Empty (?:) expression.
 
+# Bug 10835
+#   Match Start Set not being correctly computed for case insensitive patterns.
+#   (Test here is to dump the compiled pattern & manually check the start set.)
+
+"(private|secret|confidential|classified|restricted)"  i   "hmm, <0><1>Classified</1></0> stuff"
+"(private|secret|confidential|classified|restricted)"      "hmm, Classified stuff"
+
+# Bug 10844
+
+"^([\w\d:]+)$"                  "<0><1>DiesIst1Beispiel:text</1></0>"
+"^([\w\d:]+)$"           i      "<0><1>DiesIst1Beispiel:text</1></0>"
+"^(\w+\d\w+:\w+)$"              "<0><1>DiesIst1Beispiel:text</1></0>"
+"^(\w+\d\w+:\w+)$"       i      "<0><1>DiesIst1Beispiel:text</1></0>"
+
+# Bug 11049
+#   Edge cases in find() when pattern match begins with set of code points
+#   and the match begins at the end of the string.
+
+"A|B|C"                         "hello <0>A</0>"
+"A|B|C"                         "hello \U00011234"
+"A|B|\U00012345"                "hello <0>\U00012345</0>"
+"A|B|\U00010000"                "hello \ud800"
+
+# Bug 11369
+#   Incorrect optimization of patterns with a zero length quantifier {0}
+
+"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)"   "AAAAABBBBBCCCCCDDDDEEEEE"
+"(|b)ab(c)"                     "<0><1></1>ab<2>c</2></0>"
+"(|b){0}a{3}(D*)"               "<0>aaa<2></2></0>"
+"(|b){0,1}a{3}(D*)"             "<0><1></1>aaa<2></2></0>"
+"((|b){0})a{3}(D*)"             "<0><1></1>aaa<3></3></0>"
+
+# Bug 11370
+#   Max match length computation of look-behind expression gives result that is too big to fit in the
+#   24 bit operand portion of the compiled code. Expressions should fail to compile.
+#   (Look-behind match length must be bounded. This case is treated as unbounded, an error.)
+
+"(?<!(0123456789a){10000000})x"         E  "no match"
+"(?<!\\ubeaf(\\ubeaf{11000}){11000})"   E  "no match"
+
+# Bug 11374 Bad integer overflow check in number conversion.
+#           4294967300 converts to 4 with 32 bit overflow.
+
+"x{4294967300}"                         E  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+"x{0,4294967300}"                       E  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+# Bug 11373
+#
+#    Overflow checking in max match length computation for loops.
+#    Value here is 10 * 100000 * 3000 = 3E9, overflowing a 32 bit signed value.
+#    Before fixing, this case gave an assertion failure.
+
+"(?<=((0123456789){100000}){3000})abc"  E  "abc"
+
+# Bug 11507  Capture of an unpaired surrogate shouldn't allow a back reference to 
+#            match half of a surrogate pair, but only another unpaired surrogate.
+# 
+"pre(.)post\1"                  "pre\ud800post\ud800\udc00"
+"pre(.)post\1"                  "<0>pre<1>\ud800</1>post\ud800</0> fin"
+"pre(.)post\1"          i       "pre\ud800post\ud800\udc00"         # case insensitive backrefs take a different code path
+"pre(.)post\1"          i       "<0>pre<1>\ud800</1>post\ud800</0> fin"
+
+# Bug 11554
+#
+#    Maximum match length computation was assuming UTF-16.
+#    Used in look-behind matches to constrain how far back to look.
+
+"(?<=a\x{100000})spam"          "***a\x{100000}<0>spam</0>**"
+"(?<=aą)spam"                   "**aą<0>spam</0>**"
+"(?<=ąabc)spam"                 "**ąabc<0>spam</0>**"
+
+"(?<=a\x{100000})spam"          "***a\x{100001}spam**"
+"(?<=aą)spam"                   "**bąspam**"
+"(?<=ąabc)spam"                 "**ąabxspam**"
+
+# with negative look-behind
+
+"(?<!a\x{100000})spam"          "***a\x{100000}spam**"
+"(?<!aą)spam"                   "**aąspam**"
+"(?<!ąabc)spam"                 "**ąabcspam**"
+
+"(?<!a\x{100000})spam"          "***a\x{100001}<0>spam</0>**"
+"(?<!aą)spam"                   "**bą<0>spam</0>**"
+"(?<!ąabc)spam"                 "**ąabx<0>spam</0>**"
+
+# Bug #12930
+#
+#   Minimum Match Length computation, int32_t overflow on an empty set in the pattern.
+#   The empty set, with no match possible, has a min match length of INT32_MAX.
+#   Was incremented subsequently. Caused assertion failure on pattern compile.
+
+"[^\u0000-\U0010ffff]bc?"       "bc no match"
+"[^\u0000-\U0010ffff]?bc?"      "<0>bc</0> has a match"
+
 
 #  Random debugging, Temporary
 #
-#"^(?:a?b?)*$"                   "a--"
 
 "This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"
-"((?:a|b|c)whoop-dee-do) | [jkl]|zed"             "x"
-"astring|another[bcd]|alpha|a|[a]"    "x"
 
 
 #