X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..51004dcb01e06fef634b61be77ed73dd61cb6db9:/icuSources/test/testdata/regextst.txt

diff --git a/icuSources/test/testdata/regextst.txt b/icuSources/test/testdata/regextst.txt
index 3de486ac..53bd73a7 100644
--- a/icuSources/test/testdata/regextst.txt
+++ b/icuSources/test/testdata/regextst.txt
@@ -1,4 +1,4 @@
-ï»¿# Copyright (c) 2001-2010 International Business Machines
+ï»¿# Copyright (c) 2001-2012 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #  file:
@@ -13,8 +13,8 @@
 #               <tagged string> = text, with the start and end of each
 #                                 capture group tagged with <n>...</n>.  The overall match,
 #                                 if any, is group 0, as in <0>matched text</0>
-#
 #                                  A region can be specified with <r>...</r> tags.
+#                                 Standard ICU unescape will be applied, allowing \u, \U, etc. to appear.
 #
 #               <flags>         = any combination of
 #                                   i      case insensitive match
@@ -23,6 +23,7 @@
 #                                   m      multi-line mode.  
 #                                            ($ and ^ match at embedded new-lines)
 #                                   D      Unix Lines mode (only recognize 0x0a as new-line)
+#                                   Q      UREGEX_LITERAL flag.  Entire pattern is literal string.
 #                                   v      If icu configured without break iteration, this
 #                                          regex test pattern should not compile.
 #                                   e      set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
@@ -116,6 +117,24 @@
 "xyz$"                  Yz     "xyza"
 "xyz$"                  yz     "<0>xyz</0>"
 
+#
+#  HitEnd 
+#
+"abcd"                  Lz      "a"
+"abcd"                  Lz      "ab"
+"abcd"                  Lz      "abc"
+"abcd"                  LZ      "<0>abcd</0>"
+"abcd"                  LZ      "<0>abcd</0>e"
+"abcd"                  LZ      "abcx"
+"abcd"                  LZ      "abx"
+"abcd"                  Lzi     "a"
+"abcd"                  Lzi     "ab"
+"abcd"                  Lzi     "abc"
+"abcd"                  LZi     "<0>abcd</0>"
+"abcd"                  LZi     "<0>abcd</0>e"
+"abcd"                  LZi     "abcx"
+"abcd"                  LZi     "abx"
+
 #
 #  All Unicode line endings recognized.
 #     0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
@@ -176,6 +195,7 @@
 "(hello)|(goodbye)"            "<0><2>goodbye</2></0>"
 "abc( +(  inner(X?) +)  xyz)"  "leading cruft <0>abc<1>     <2>  inner<3></3>    </2>  xyz</1></0> cruft"
 "\s*([ixsmdt]*)([:letter:]*)"  "<0>   <1>d</1><2></2></0>  "
+"(a|b)c*d"                     "a<0><1>b</1>cd</0>"
 
 # Non-capturing parens (?: stuff).   Groups, but does not capture.
 "(?:abc)*(tail)"               "<0>abcabcabc<1>tail</1></0>"
@@ -215,6 +235,7 @@
 ".*?\b(.).*"                   "<0>  $%^&*( <1>h</1>ello123%^&*()gxx</0>"
 "\ba\b"                        "-<0>a</0>"
 "\by\b"                        "xy"
+"[ \b]"                        "<0>b</0>"     # in a set, \b is a literal b.
 
 # Finds first chars of up to 5 words
 "(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?"   "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox"
@@ -260,6 +281,12 @@
 # \Q...\E quote mode
 "hel\Qlo, worl\Ed"             "<0>hello, world</0>"
 "\Q$*^^(*)?\A\E(a*)"           "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
+"[abc\Q]\r\E]+"                "<0>aaaccc]]]\\\\\\</0>\r..."   # \Q ... \E escape in a [set]
+
+# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized.
+#                  Note that data strings in test cases still get escape processing.
+"abc\an\r\E\\abcd\u0031bye"     Q  "lead<0>abc\\an\\r\\E\\\\abcd\\u0031bye</0>extra"
+"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral</0>"
 
 # \S and \s  space characters
 "\s+"                          "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
@@ -452,6 +479,40 @@
 "ab(?:c|(d?))(\1)"             "<0>ab<1></1><2></2></0>e"
 "ab(?:c|(d?))(\1)"             "<0>ab<1></1><2></2></0>"
 
+# Back References that hit/don't hit end
+"(abcd) \1"                z   "abcd abc"
+"(abcd) \1"                Z   "<0><1>abcd</1> abcd</0>"
+"(abcd) \1"                Z   "<0><1>abcd</1> abcd</0> "
+
+# Case Insensitive back references that hit/don't hit end.
+"(abcd) \1"                zi  "abcd abc"
+"(abcd) \1"                Zi  "<0><1>abcd</1> ABCD</0>"
+"(abcd) \1"                Zi  "<0><1>abcd</1> ABCD</0> "
+
+# Back references that hit/don't hit boundary limits.
+
+"(abcd) \1"                z   "<r>abcd abc</r>d "
+"(abcd) \1"                Z   "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1"                Z   "<r><0><1>abcd</1> abcd</0> </r>"
+
+"(abcd) \1"                zi  "<r>abcd abc</r>d "
+"(abcd) \1"                Zi  "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1"                Zi  "<r><0><1>abcd</1> abcd</0> </r>"
+
+# Back reference that fails match near the end of input without actually hitting the end.
+"(abcd) \1"                ZL  "abcd abd"
+"(abcd) \1"                ZLi "abcd abd"
+
+# Back references to a zero-length match.  They always match successfully.
+"ab(x?)cd(\1)ef"               "<0>ab<1></1>cd<2></2>ef</0>"
+"ab(x?)cd(\1)ef"            i  "<0>ab<1></1>cd<2></2>ef</0>"
+
+# Back refs to capture groups that didn't participate in the match.
+"ab(?:(c)|(d))\1"              "abde"
+"ab(?:(c)|(d))\1"              "<0>ab<1>c</1>c</0>e"
+"ab(?:(c)|(d))\1"            i "abde"
+"ab(?:(c)|(d))\1"            i "<0>ab<1>c</1>c</0>e"
+
 # Case Insensitive
 "aBc"                    i      "<0>ABC</0>"
 "a[^bc]d"                i      "ABD"
@@ -602,6 +663,7 @@
 "\ud800\udc00*"                   "<0>\U00010000\U00010000\U00010000</0>\U00010001"
 "\ud800\ud800\udc00"              "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
 "(\ud800)(\udc00)"                "\U00010000"
+"\U00010001+"                     "<0>\U00010001\U00010001</0>\udc01"
 
 #
 # hitEnd with find()
@@ -610,6 +672,12 @@
 "abc"                       2Z    "aaabc  <0>abc</0>ab"
 "abc"                       3z    "aa>abc  abcab"
 
+#
+# \ escaping
+#
+"abc\jkl"                         "<0>abcjkl</0>"    # escape of a non-special letter is just itself.
+"abc[ \j]kl"                      "<0>abcjkl</0>"
+
 #
 # Bug xxxx
 #
@@ -1024,11 +1092,63 @@
 "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "<0><1>94040</1><2>-3344</2></0>"
 "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "94040-0000"
 "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "00000-3344"
+
+#
+#    Bug 8666.  Assertion failure on match, bad operand to JMP_SAV_X opcode.
+#
+"((.??)+|A)*"                     "<0><1><2></2></1></0>AAAAABBBBBCCCCCDDDDEEEEE"
+
 #
+#    Bug 8826.  Incorrect results with case insensitive matches.
+#
+"AS(X)"                         i "aßx"
+"AS.*"                          i "aßx"           # Expansion of sharp s can't split between pattern terms.
+"ASßS"                          i "<0>aßß</0>"    # All one literal string, does match.
+"ASß{1}S"                       i "aßß"           # Pattern with terms, no match.
+"aßx"                           i "<0>assx</0>"
+"aßx"                           i "<0>ASSX</0>"
+"aßx"                           i "<0>aßx</0>"
+"ASS(.)"                        i "<0>aß<1>x</1></0>"
+
+# Case Insensitive, probe some corner cases.
+"ass+"                          i "aß"            # Second 's' in pattern is qualified, can't combine with first.
+"as+"                           i "aß"
+"aßs"                           i "as"            # Can't match half of a ß
+"aß+"                           i "<0>assssssss</0>s"
+"aß+"                           i "<0>assßSssSSS</0>s"
+"a(ß?)+"                        i "<0>assssssss<1></1></0>s"
+"a(ß?)+"                        i "<0>a<1></1></0>zzzzzzzzs"
+
+"\U00010400"                    i "<0>\U00010428</0>"   # case folded supplemental code point.
+
+"sstuff"                        i "<0>ßtuff</0>"    # exercise optimizations on what chars can start a match.
+"sstuff"                        i "s<0>ßtuff</0>"    # exercise optimizations on what chars can start a match.
+"ßtuff"                         i "s<0>sstuff</0>"
+"ßtuff"                         i "s<0>Sstuff</0>"
+
+"a(..)\1"                       i "<0>A<1>bc</1>BC</0>def"
+"(Ã)\1"                         i "aa<0><1>ss</1>Ã</0>zz"          # Case insensitive back reference
+"..(.)\1"                       i "<0>aa<1>Ã</1>ss</0>"
+"ab(..)\1"                      i "xx<0>ab<1>ss</1>Ã</0>ss" 
+
+" (ss) ((\1.*)|(.*))"           i "<0> <1>ss</1> <2><4>sß</4></2></0>"       # The back reference 'ss' must not match in 'sß'
+
+# Bug 9057
+#   \u200c and \u200d should be word characters.
+#
+"\w+"                             "  <0>abc\u200cdef\u200dghi</0>   "
+"\w+"                           i "  <0>abc\u200cdef\u200dghi</0>   "
+"[\w]+"                           "  <0>abc\u200cdef\u200dghi</0>   "
+"[\w]+"                         i "  <0>abc\u200cdef\u200dghi</0>   "
+
+# Bug 9283
+#  uregex_open fails for look-behind assertion + case-insensitive
+
+"(ab)?(?<=ab)cd|ef"             i  "<0><1>ab</1>cd</0>"
+
 #  Random debugging, Temporary
 #
 #"^(?:a?b?)*$"	                  "a--"
-"^(?:a?b?)*$"	                  "a--"
 
 "This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"
 "((?:a|b|c)whoop-dee-do) | [jkl]|zed"             "x"