-# Copyright (c) 2001-2010 International Business Machines
+# Copyright (c) 2001-2012 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
# <tagged string> = text, with the start and end of each
# capture group tagged with <n>...</n>. The overall match,
# if any, is group 0, as in <0>matched text</0>
-#
# A region can be specified with <r>...</r> tags.
+# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear.
#
# <flags> = any combination of
# i case insensitive match
# m multi-line mode.
# ($ and ^ match at embedded new-lines)
# D Unix Lines mode (only recognize 0x0a as new-line)
+# Q UREGEX_LITERAL flag. Entire pattern is literal string.
# v If ICU is configured without break iteration, this
# regex test pattern should not compile.
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
"xyz$" Yz "xyza"
"xyz$" yz "<0>xyz</0>"
+#
+# HitEnd
+#
+"abcd" Lz "a"
+"abcd" Lz "ab"
+"abcd" Lz "abc"
+"abcd" LZ "<0>abcd</0>"
+"abcd" LZ "<0>abcd</0>e"
+"abcd" LZ "abcx"
+"abcd" LZ "abx"
+"abcd" Lzi "a"
+"abcd" Lzi "ab"
+"abcd" Lzi "abc"
+"abcd" LZi "<0>abcd</0>"
+"abcd" LZi "<0>abcd</0>e"
+"abcd" LZi "abcx"
+"abcd" LZi "abx"
+
#
# All Unicode line endings recognized.
# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
"(hello)|(goodbye)" "<0><2>goodbye</2></0>"
"abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3></3> </2> xyz</1></0> cruft"
"\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d</1><2></2></0> "
+"(a|b)c*d" "a<0><1>b</1>cd</0>"
# Non-capturing parens (?: stuff). Groups, but does not capture.
"(?:abc)*(tail)" "<0>abcabcabc<1>tail</1></0>"
".*?\b(.).*" "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>"
"\ba\b" "-<0>a</0>"
"\by\b" "xy"
+"[ \b]" "<0>b</0>" # in a set, \b is a literal b.
# Finds first chars of up to 5 words
"(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox"
# \Q...\E quote mode
"hel\Qlo, worl\Ed" "<0>hello, world</0>"
"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
+"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\</0>\r..." # \Q ... \E escape in a [set]
+
+# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized.
+# Note that data strings in test cases still get escape processing.
+"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031bye</0>extra"
+"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral</0>"
# \S and \s space characters
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>e"
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>"
+# Back References that hit/don't hit end
+"(abcd) \1" z "abcd abc"
+"(abcd) \1" Z "<0><1>abcd</1> abcd</0>"
+"(abcd) \1" Z "<0><1>abcd</1> abcd</0> "
+
+# Case Insensitive back references that hit/don't hit end.
+"(abcd) \1" zi "abcd abc"
+"(abcd) \1" Zi "<0><1>abcd</1> ABCD</0>"
+"(abcd) \1" Zi "<0><1>abcd</1> ABCD</0> "
+
+# Back references that hit/don't hit boundary limits.
+
+"(abcd) \1" z "<r>abcd abc</r>d "
+"(abcd) \1" Z "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1" Z "<r><0><1>abcd</1> abcd</0> </r>"
+
+"(abcd) \1" zi "<r>abcd abc</r>d "
+"(abcd) \1" Zi "<r><0><1>abcd</1> abcd</0></r> "
+"(abcd) \1" Zi "<r><0><1>abcd</1> abcd</0> </r>"
+
+# Back reference that fails match near the end of input without actually hitting the end.
+"(abcd) \1" ZL "abcd abd"
+"(abcd) \1" ZLi "abcd abd"
+
+# Back references to a zero-length match. They always match successfully.
+"ab(x?)cd(\1)ef" "<0>ab<1></1>cd<2></2>ef</0>"
+"ab(x?)cd(\1)ef" i "<0>ab<1></1>cd<2></2>ef</0>"
+
+# Back refs to capture groups that didn't participate in the match.
+"ab(?:(c)|(d))\1" "abde"
+"ab(?:(c)|(d))\1" "<0>ab<1>c</1>c</0>e"
+"ab(?:(c)|(d))\1" i "abde"
+"ab(?:(c)|(d))\1" i "<0>ab<1>c</1>c</0>e"
+
# Case Insensitive
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"
"\ud800\udc00*" "<0>\U00010000\U00010000\U00010000</0>\U00010001"
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
"(\ud800)(\udc00)" "\U00010000"
+"\U00010001+" "<0>\U00010001\U00010001</0>\udc01"
#
# hitEnd with find()
"abc" 2Z "aaabc <0>abc</0>ab"
"abc" 3z "aa>abc abcab"
+#
+# \ escaping
+#
+"abc\jkl" "<0>abcjkl</0>" # escape of a non-special letter is just itself.
+"abc[ \j]kl" "<0>abcjkl</0>"
+
#
# Bug xxxx
#
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "<0><1>94040</1><2>-3344</2></0>"
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "94040-0000"
"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "00000-3344"
+
+#
+# Bug 8666. Assertion failure on match, bad operand to JMP_SAV_X opcode.
+#
+"((.??)+|A)*" "<0><1><2></2></1></0>AAAAABBBBBCCCCCDDDDEEEEE"
+
#
+# Bug 8826. Incorrect results with case insensitive matches.
+#
+"AS(X)" i "aßx"
+"AS.*" i "aßx" # Expansion of sharp s can't split between pattern terms.
+"ASßS" i "<0>aßß</0>" # All one literal string, does match.
+"ASß{1}S" i "aßß" # Pattern with terms, no match.
+"aßx" i "<0>assx</0>"
+"aßx" i "<0>ASSX</0>"
+"aßx" i "<0>aßx</0>"
+"ASS(.)" i "<0>aß<1>x</1></0>"
+
+# Case Insensitive, probe some corner cases.
+"ass+" i "aß" # Second 's' in pattern is qualified, can't combine with first.
+"as+" i "aß"
+"aßs" i "as" # Can't match half of a ß
+"aß+" i "<0>assssssss</0>s"
+"aß+" i "<0>assßSssSSS</0>s"
+"a(ß?)+" i "<0>assssssss<1></1></0>s"
+"a(ß?)+" i "<0>a<1></1></0>zzzzzzzzs"
+
+"\U00010400" i "<0>\U00010428</0>" # case folded supplemental code point.
+
+"sstuff" i "<0>ßtuff</0>" # exercise optimizations on what chars can start a match.
+"sstuff" i "s<0>ßtuff</0>" # exercise optimizations on what chars can start a match.
+"ßtuff" i "s<0>sstuff</0>"
+"ßtuff" i "s<0>Sstuff</0>"
+
+"a(..)\1" i "<0>A<1>bc</1>BC</0>def"
+"(ß)\1" i "aa<0><1>ss</1>ß</0>zz" # Case insensitive back reference
+"..(.)\1" i "<0>aa<1>ß</1>ss</0>"
+"ab(..)\1" i "xx<0>ab<1>ss</1>ß</0>ss"
+
+" (ss) ((\1.*)|(.*))" i "<0> <1>ss</1> <2><4>sß</4></2></0>" # The back reference 'ss' must not match in 'sß'
+
+# Bug 9057
+# \u200c and \u200d should be word characters.
+#
+"\w+" " <0>abc\u200cdef\u200dghi</0> "
+"\w+" i " <0>abc\u200cdef\u200dghi</0> "
+"[\w]+" " <0>abc\u200cdef\u200dghi</0> "
+"[\w]+" i " <0>abc\u200cdef\u200dghi</0> "
+
+# Bug 9283
+# uregex_open fails for look-behind assertion + case-insensitive
+
+"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
+
# Random debugging, Temporary
#
#"^(?:a?b?)*$" "a--"
-"^(?:a?b?)*$" "a--"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"