]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/test/testdata/regextst.txt
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / test / testdata / regextst.txt
index 15d13bf156ea48750832fad9fe20a82e383c78e3..6873f4835c9d085a5cec663a42e82790fc6bcd40 100644 (file)
@@ -1,4 +1,6 @@
-# Copyright (c) 2001-2015 International Business Machines
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2001-2015 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #  file:
 
 "(?<=((0123456789){100000}){3000})abc"  E  "abc"
 
+# Bug 11507  Capture of an unpaired surrogate shouldn't allow a back reference to 
+#            match half of a surrogate pair, but only another unpaired surrogate.
+# 
+"pre(.)post\1"                  "pre\ud800post\ud800\udc00"
+"pre(.)post\1"                  "<0>pre<1>\ud800</1>post\ud800</0> fin"
+"pre(.)post\1"          i       "pre\ud800post\ud800\udc00"         # case-insensitive backrefs take a different code path
+"pre(.)post\1"          i       "<0>pre<1>\ud800</1>post\ud800</0> fin"
+
+# Bug 11554
+#
+#    Maximum match length computation was assuming UTF-16.
+#    Used in look-behind matches to constrain how far back to look.
+
+"(?<=a\x{100000})spam"          "***a\x{100000}<0>spam</0>**"
+"(?<=aą)spam"                   "**aą<0>spam</0>**"
+"(?<=ąabc)spam"                 "**ąabc<0>spam</0>**"
+
+"(?<=a\x{100000})spam"          "***a\x{100001}spam**"
+"(?<=aą)spam"                   "**bąspam**"
+"(?<=ąabc)spam"                 "**ąabxspam**"
+
+# with negative look-behind
+
+"(?<!a\x{100000})spam"          "***a\x{100000}spam**"
+"(?<!aą)spam"                   "**aąspam**"
+"(?<!ąabc)spam"                 "**ąabcspam**"
+
+"(?<!a\x{100000})spam"          "***a\x{100001}<0>spam</0>**"
+"(?<!aą)spam"                   "**bą<0>spam</0>**"
+"(?<!ąabc)spam"                 "**ąabx<0>spam</0>**"
+
+# Bug #12930
+#
+#   Minimum Match Length computation, int32_t overflow on an empty set in the pattern.
+#   The empty set, with no match possible, has a min match length of INT32_MAX.
+#   That value was subsequently incremented, which caused an assertion failure on pattern compile.
+
+"[^\u0000-\U0010ffff]bc?"       "bc no match"
+"[^\u0000-\U0010ffff]?bc?"      "<0>bc</0> has a match"
 
 
 #  Random debugging, Temporary