ICU-491.11.1.tar.gz

[apple/icu.git] / icuSources / test / testdata / regextst.txt
diff --git a/icuSources/test/testdata/regextst.txt b/icuSources/test/testdata/regextst.txt

index 3de486ac54903870ea8c795874657675848b2eec..53bd73a7ef3f37435c5df6fd134fc71d923393db 100644 (file)
--- a/icuSources/test/testdata/regextst.txt
+++ b/icuSources/test/testdata/regextst.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2010 International Business Machines
+# Copyright (c) 2001-2012 International Business Machines
  # Corporation and others. All Rights Reserved.
  #
  #  file:
@@ -13,8 +13,8 @@
  #               <tagged string> = text, with the start and end of each
  #                                 capture group tagged with <n>...</n>.  The overall match,
  #                                 if any, is group 0, as in <0>matched text</0>
-#
  #                                  A region can be specified with <r>...</r> tags.
+#                                 Standard ICU unescape will be applied, allowing \u, \U, etc. to appear.
  #
  #               <flags>         = any combination of
  #                                   i      case insensitive match
@@ -23,6 +23,7 @@
  #                                   m      multi-line mode.  
  #                                            ($ and ^ match at embedded new-lines)
  #                                   D      Unix Lines mode (only recognize 0x0a as new-line)
+#                                   Q      UREGEX_LITERAL flag.  Entire pattern is literal string.
  #                                   v      If icu configured without break iteration, this
  #                                          regex test pattern should not compile.
  #                                   e      set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
@@ -116,6 +117,24 @@
  "xyz$"                  Yz     "xyza"
  "xyz$"                  yz     "<0>xyz</0>"
  
+#
+#  HitEnd 
+#
+"abcd"                  Lz      "a"
+"abcd"                  Lz      "ab"
+"abcd"                  Lz      "abc"
+"abcd"                  LZ      "<0>abcd</0>"
+"abcd"                  LZ      "<0>abcd</0>e"
+"abcd"                  LZ      "abcx"
+"abcd"                  LZ      "abx"
+"abcd"                  Lzi     "a"
+"abcd"                  Lzi     "ab"
+"abcd"                  Lzi     "abc"
+"abcd"                  LZi     "<0>abcd</0>"
+"abcd"                  LZi     "<0>abcd</0>e"
+"abcd"                  LZi     "abcx"
+"abcd"                  LZi     "abx"
+
  #
  #  All Unicode line endings recognized.
  #     0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
@@ -176,6 +195,7 @@
  "(hello)|(goodbye)"            "<0><2>goodbye</2></0>"
  "abc( +(  inner(X?) +)  xyz)"  "leading cruft <0>abc<1>     <2>  inner<3></3>    </2>  xyz</1></0> cruft"
  "\s*([ixsmdt]*)([:letter:]*)"  "<0>   <1>d</1><2></2></0>  "
+"(a|b)c*d"                     "a<0><1>b</1>cd</0>"
  
  # Non-capturing parens (?: stuff).   Groups, but does not capture.
  "(?:abc)*(tail)"               "<0>abcabcabc<1>tail</1></0>"
@@ -215,6 +235,7 @@
  ".*?\b(.).*"                   "<0>  $%^&*( <1>h</1>ello123%^&*()gxx</0>"
  "\ba\b"                        "-<0>a</0>"
  "\by\b"                        "xy"
+"[ \b]"                        "<0>b</0>"     # in a set, \b is a literal b.
  
  # Finds first chars of up to 5 words
  "(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?"   "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox"
@@ -260,6 +281,12 @@
  # \Q...\E quote mode
  "hel\Qlo, worl\Ed"             "<0>hello, world</0>"
  "\Q$*^^(*)?\A\E(a*)"           "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
+"[abc\Q]\r\E]+"                "<0>aaaccc]]]\\\\\\</0>\r..."   # \Q ... \E escape in a [set]
+
+# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized.
+#                  Note that data strings in test cases still get escape processing.
+"abc\an\r\E\\abcd\u0031bye"     Q  "lead<0>abc\\an\\r\\E\\\\abcd\\u0031bye</0>extra"
+"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral</0>"
  
  # \S and \s  space characters
  "\s+"                          "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
@@ -452,6 +479,40 @@
  "ab(?:c|(d?))(\1)"             "<0>ab<1></1><2></2></0>e"
  "ab(?:c|(d?))(\1)"             "<0>ab<1></1><2></2></0>"
  
+# Back References that hit/don't hit end
+"(abcd) \1"                z   "abcd abc"
+"(abcd) \1"                Z   "<0><1>abcd</1> abcd</0>"
+"(abcd) \1"                Z   "<0><1>abcd</1> abcd</0> "
+
+# Case Insensitive back references that hit/don't hit end.
+"(abcd) \1"                zi  "abcd abc"
+"(abcd) \1"                Zi  "<0><1>abcd</1> ABCD</0>"
+"(abcd) \1"                Zi  "<0><1>abcd</1> ABCD</0> "
+
+# Back references that hit/don't hit boundary limits.
+
+"(abcd) \1"                z   "<r>abcd abc</r>d "
+"(abcd) \1"                Z   "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1"                Z   "<r><0><1>abcd</1> abcd</0> </r>"
+
+"(abcd) \1"                zi  "<r>abcd abc</r>d "
+"(abcd) \1"                Zi  "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1"                Zi  "<r><0><1>abcd</1> abcd</0> </r>"
+
+# Back reference that fails match near the end of input without actually hitting the end.
+"(abcd) \1"                ZL  "abcd abd"
+"(abcd) \1"                ZLi "abcd abd"
+
+# Back references to a zero-length match.  They always match successfully.
+"ab(x?)cd(\1)ef"               "<0>ab<1></1>cd<2></2>ef</0>"
+"ab(x?)cd(\1)ef"            i  "<0>ab<1></1>cd<2></2>ef</0>"
+
+# Back refs to capture groups that didn't participate in the match.
+"ab(?:(c)|(d))\1"              "abde"
+"ab(?:(c)|(d))\1"              "<0>ab<1>c</1>c</0>e"
+"ab(?:(c)|(d))\1"            i "abde"
+"ab(?:(c)|(d))\1"            i "<0>ab<1>c</1>c</0>e"
+
  # Case Insensitive
  "aBc"                    i      "<0>ABC</0>"
  "a[^bc]d"                i      "ABD"
@@ -602,6 +663,7 @@
  "\ud800\udc00*"                   "<0>\U00010000\U00010000\U00010000</0>\U00010001"
  "\ud800\ud800\udc00"              "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
  "(\ud800)(\udc00)"                "\U00010000"
+"\U00010001+"                     "<0>\U00010001\U00010001</0>\udc01"
  
  #
  # hitEnd with find()
@@ -610,6 +672,12 @@
  "abc"                       2Z    "aaabc  <0>abc</0>ab"
  "abc"                       3z    "aa>abc  abcab"
  
+#
+# \ escaping
+#
+"abc\jkl"                         "<0>abcjkl</0>"    # escape of a non-special letter is just itself.
+"abc[ \j]kl"                      "<0>abcjkl</0>"
+
  #
  # Bug xxxx
  #
@@ -1024,11 +1092,63 @@
  "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "<0><1>94040</1><2>-3344</2></0>"
  "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "94040-0000"
  "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?"    "00000-3344"
+
+#
+#    Bug 8666.  Assertion failure on match, bad operand to JMP_SAV_X opcode.
+#
+"((.??)+|A)*"                     "<0><1><2></2></1></0>AAAAABBBBBCCCCCDDDDEEEEE"
+
  #
+#    Bug 8826.  Incorrect results with case insensitive matches.
+#
+"AS(X)"                         i "aßx"
+"AS.*"                          i "aßx"           # Expansion of sharp s can't split between pattern terms.
+"ASßS"                          i "<0>aßß</0>"    # All one literal string, does match.
+"ASß{1}S"                       i "aßß"           # Pattern with terms, no match.
+"aßx"                           i "<0>assx</0>"
+"aßx"                           i "<0>ASSX</0>"
+"aßx"                           i "<0>aßx</0>"
+"ASS(.)"                        i "<0>aß<1>x</1></0>"
+
+# Case Insensitive, probe some corner cases.
+"ass+"                          i "aß"            # Second 's' in pattern is qualified, can't combine with first.
+"as+"                           i "aß"
+"aßs"                           i "as"            # Can't match half of a ß
+"aß+"                           i "<0>assssssss</0>s"
+"aß+"                           i "<0>assßSssSSS</0>s"
+"a(ß?)+"                        i "<0>assssssss<1></1></0>s"
+"a(ß?)+"                        i "<0>a<1></1></0>zzzzzzzzs"
+
+"\U00010400"                    i "<0>\U00010428</0>"   # case folded supplemental code point.
+
+"sstuff"                        i "<0>ßtuff</0>"    # exercise optimizations on what chars can start a match.
+"sstuff"                        i "s<0>ßtuff</0>"    # exercise optimizations on what chars can start a match.
+"ßtuff"                         i "s<0>sstuff</0>"
+"ßtuff"                         i "s<0>Sstuff</0>"
+
+"a(..)\1"                       i "<0>A<1>bc</1>BC</0>def"
+"(ß)\1"                         i "aa<0><1>ss</1>ß</0>zz"          # Case insensitive back reference
+"..(.)\1"                       i "<0>aa<1>ß</1>ss</0>"
+"ab(..)\1"                      i "xx<0>ab<1>ss</1>ß</0>ss" 
+
+" (ss) ((\1.*)|(.*))"           i "<0> <1>ss</1> <2><4>sß</4></2></0>"       # The back reference 'ss' must not match in 'sß'
+
+# Bug 9057
+#   \u200c and \u200d should be word characters.
+#
+"\w+"                             "  <0>abc\u200cdef\u200dghi</0>   "
+"\w+"                           i "  <0>abc\u200cdef\u200dghi</0>   "
+"[\w]+"                           "  <0>abc\u200cdef\u200dghi</0>   "
+"[\w]+"                         i "  <0>abc\u200cdef\u200dghi</0>   "
+
+# Bug 9283
+#  uregex_open fails for look-behind assertion + case-insensitive
+
+"(ab)?(?<=ab)cd|ef"             i  "<0><1>ab</1>cd</0>"
+
  #  Random debugging, Temporary
  #
  #"^(?:a?b?)*$"                   "a--"
-"^(?:a?b?)*$"                    "a--"
  
  "This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"
  "((?:a|b|c)whoop-dee-do) | [jkl]|zed"             "x"