-# Copyright (c) 2001-2013 International Business Machines
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2001-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
"ab(?:(c)|(d))\1" i "abde"
"ab(?:(c)|(d))\1" i "<0>ab<1>c</1>c</0>e"
+# Named back references
+"(?<one>abcd)\k<one>" "<0><1>abcd</1>abcd</0>"
+"(no)?(?<one>abcd)\k<one>" "<0><2>abcd</2>abcd</0>"
+
+"(?<a_1>...)" E " " # backref names are ascii letters & numbers only
+"(?<1a>...)" E " " # backref names must begin with a letter
+"(?<a>.)(?<a>.)" E " " # Repeated names are illegal.
+
+
# Case Insensitive
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
"(?:(?i)a)b" "<0>Ab</0>"
-"ab(?i)cd" "<0>abCd</0>"
+"ab(?i)cd" "<0>abCd</0>"
"ab$cd" "abcd"
+"ssl" i "abc<0>ßl</0>xyz"
+"ssl" i "abc<0>ẞl</0>xyz"
+"FIND" i "can <0>find</0> ?" # fi ligature, \ufb01
+"find" i "can <0>FIND</0> ?"
+"ῧ" i "xxx<0>ῧ</0>xxx" # Composed char (match string) decomposes when case-folded (pattern)
+
# White space handling
"a b" "ab"
"abc " "abc"
"abc\jkl" "<0>abcjkl</0>" # escape of a non-special letter is just itself.
"abc[ \j]kl" "<0>abcjkl</0>"
+#
+# \R all newline sequences.
+#
+"abc\Rxyz" "<0>abc\u000axyz</0>gh"
+"abc\Rxyz" "<0>abc\u000bxyz</0>gh"
+"abc\Rxyz" "<0>abc\u000cxyz</0>gh"
+"abc\Rxyz" "<0>abc\u000dxyz</0>gh"
+"abc\Rxyz" "<0>abc\u0085xyz</0>gh"
+"abc\Rxyz" "<0>abc\u2028xyz</0>gh"
+"abc\Rxyz" "<0>abc\u2029xyz</0>gh"
+"abc\Rxyz" "<0>abc\u000d\u000axyz</0>gh"
+
+"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence.
+"abc\r\nxyz" "<0>abc\u000d\u000axyz</0>gh"
+
+"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches.
+"abc\Rxyz" "abc\u000exyz"
+"abc\Rxyz" "abc\u202axyz"
+
+# \v \V single character new line sequences.
+
+"abc\vxyz" "<0>abc\u000axyz</0>gh"
+"abc\vxyz" "<0>abc\u000bxyz</0>gh"
+"abc\vxyz" "<0>abc\u000cxyz</0>gh"
+"abc\vxyz" "<0>abc\u000dxyz</0>gh"
+"abc\vxyz" "<0>abc\u0085xyz</0>gh"
+"abc\vxyz" "<0>abc\u2028xyz</0>gh"
+"abc\vxyz" "<0>abc\u2029xyz</0>gh"
+"abc\vxyz" "abc\u000d\u000axyzgh"
+"abc\vxyz" "abc?xyzgh"
+
+"abc[\v]xyz" "<0>abc\u000axyz</0>gh"
+"abc[\v]xyz" "<0>abc\u000bxyz</0>gh"
+"abc[\v]xyz" "<0>abc\u000cxyz</0>gh"
+"abc[\v]xyz" "<0>abc\u000dxyz</0>gh"
+"abc[\v]xyz" "<0>abc\u0085xyz</0>gh"
+"abc[\v]xyz" "<0>abc\u2028xyz</0>gh"
+"abc[\v]xyz" "<0>abc\u2029xyz</0>gh"
+"abc[\v]xyz" "abc\u000d\u000axyzgh"
+"abc[\v]xyz" "abc?xyzgh"
+
+"abc\Vxyz" "abc\u000axyzgh"
+"abc\Vxyz" "abc\u000bxyzgh"
+"abc\Vxyz" "abc\u000cxyzgh"
+"abc\Vxyz" "abc\u000dxyzgh"
+"abc\Vxyz" "abc\u0085xyzgh"
+"abc\Vxyz" "abc\u2028xyzgh"
+"abc\Vxyz" "abc\u2029xyzgh"
+"abc\Vxyz" "abc\u000d\u000axyzgh"
+"abc\Vxyz" "<0>abc?xyz</0>gh"
+
+# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab
+
+"abc\hxyz" "<0>abc xyz</0>gh"
+"abc\Hxyz" "abc xyzgh"
+"abc\hxyz" "<0>abc\u2003xyz</0>gh"
+"abc\Hxyz" "abc\u2003xyzgh"
+"abc\hxyz" "<0>abc\u0009xyz</0>gh"
+"abc\Hxyz" "abc\u0009xyzgh"
+"abc\hxyz" "abc?xyzgh"
+"abc\Hxyz" "<0>abc?xyz</0>gh"
+
+"abc[\h]xyz" "<0>abc xyz</0>gh"
+"abc[\H]xyz" "abc xyzgh"
+"abc[\h]xyz" "<0>abc\u2003xyz</0>gh"
+"abc[\H]xyz" "abc\u2003xyzgh"
+"abc[\h]xyz" "<0>abc\u0009xyz</0>gh"
+"abc[\H]xyz" "abc\u0009xyzgh"
+"abc[\h]xyz" "abc?xyzgh"
+"abc[\H]xyz" "<0>abc?xyz</0>gh"
+
+
#
# Bug xxxx
#
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
+# Bug 10835
+# Match Start Set not being correctly computed for case insensitive patterns.
+# (Test here is to dump the compiled pattern & manually check the start set.)
+
+"(private|secret|confidential|classified|restricted)" i "hmm, <0><1>Classified</1></0> stuff"
+"(private|secret|confidential|classified|restricted)" "hmm, Classified stuff"
+
+# Bug 10844
+
+"^([\w\d:]+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
+"^([\w\d:]+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
+"^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
+"^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
+
+# Bug 11049
+# Edge cases in find() when pattern match begins with set of code points
+# and the match begins at the end of the string.
+
+"A|B|C" "hello <0>A</0>"
+"A|B|C" "hello \U00011234"
+"A|B|\U00012345" "hello <0>\U00012345</0>"
+"A|B|\U00010000" "hello \ud800"
+
+# Bug 11369
+# Incorrect optimization of patterns with a zero length quantifier {0}
+
+"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE"
+"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>"
+"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>"
+"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>"
+"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>"
+
+# Bug 11370
+# Max match length computation of look-behind expression gives result that is too big to fit in the
+# 24 bit operand portion of the compiled code. Expressions should fail to compile.
+# (Look-behind match length must be bounded. This case is treated as unbounded, an error.)
+
+"(?<!(0123456789a){10000000})x" E "no match"
+"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match"
+
+# Bug 11374 Bad integer overflow check in number conversion.
+# 4294967300 converts to 4 with 32 bit overflow.
+
+"x{4294967300}" E "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+"x{0,4294967300}" E "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+# Bug 11373
+#
+# Overflow checking in max match length computation for loops.
+# Value here is 10 * 100000 * 3000 = 3E9, overflowing a 32 bit signed value.
+# Before fixing, this case gave an assertion failure.
+
+"(?<=((0123456789){100000}){3000})abc" E "abc"
+
+# Bug 11507 Capture of an unpaired surrogate shouldn't allow a back reference to
+# match half of a surrogate pair, but only another unpaired surrogate.
+#
+"pre(.)post\1" "pre\ud800post\ud800\udc00"
+"pre(.)post\1" "<0>pre<1>\ud800</1>post\ud800</0> fin"
+"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensitive backrefs take a different code path
+"pre(.)post\1" i "<0>pre<1>\ud800</1>post\ud800</0> fin"
+
+# Bug 11554
+#
+# Maximum match length computation was assuming UTF-16.
+# Used in look-behind matches to constrain how far back to look.
+
+"(?<=a\x{100000})spam" "***a\x{100000}<0>spam</0>**"
+"(?<=aą)spam" "**aą<0>spam</0>**"
+"(?<=ąabc)spam" "**ąabc<0>spam</0>**"
+
+"(?<=a\x{100000})spam" "***a\x{100001}spam**"
+"(?<=aą)spam" "**bąspam**"
+"(?<=ąabc)spam" "**ąabxspam**"
+
+# with negative look-behind
+
+"(?<!a\x{100000})spam" "***a\x{100000}spam**"
+"(?<!aą)spam" "**aąspam**"
+"(?<!ąabc)spam" "**ąabcspam**"
+
+"(?<!a\x{100000})spam" "***a\x{100001}<0>spam</0>**"
+"(?<!aą)spam" "**bą<0>spam</0>**"
+"(?<!ąabc)spam" "**ąabx<0>spam</0>**"
+
+# Bug #12930
+#
+# Minimum Match Length computation, int32_t overflow on an empty set in the pattern.
+# The empty set, with no match possible, has a min match length of INT32_MAX.
+# Was incremented subsequently. Caused assertion failure on pattern compile.
+
+"[^\u0000-\U0010ffff]bc?" "bc no match"
+"[^\u0000-\U0010ffff]?bc?" "<0>bc</0> has a match"
+
# Random debugging, Temporary
#
-#"^(?:a?b?)*$" "a--"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
-"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"
-"astring|another[bcd]|alpha|a|[a]" "x"
#