X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..51004dcb01e06fef634b61be77ed73dd61cb6db9:/icuSources/test/testdata/regextst.txt diff --git a/icuSources/test/testdata/regextst.txt b/icuSources/test/testdata/regextst.txt index 3de486ac..53bd73a7 100644 --- a/icuSources/test/testdata/regextst.txt +++ b/icuSources/test/testdata/regextst.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2001-2010 International Business Machines +# Copyright (c) 2001-2012 International Business Machines # Corporation and others. All Rights Reserved. # # file: @@ -13,8 +13,8 @@ # = text, with the start and end of each # capture group tagged with .... The overall match, # if any, is group 0, as in <0>matched text -# # A region can be specified with ... tags. +# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear. # # = any combination of # i case insensitive match @@ -23,6 +23,7 @@ # m multi-line mode. # ($ and ^ match at embedded new-lines) # D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. # v If icu configured without break iteration, this # regex test pattern should not compile. # e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag @@ -116,6 +117,24 @@ "xyz$" Yz "xyza" "xyz$" yz "<0>xyz" +# +# HitEnd +# +"abcd" Lz "a" +"abcd" Lz "ab" +"abcd" Lz "abc" +"abcd" LZ "<0>abcd" +"abcd" LZ "<0>abcde" +"abcd" LZ "abcx" +"abcd" LZ "abx" +"abcd" Lzi "a" +"abcd" Lzi "ab" +"abcd" Lzi "abc" +"abcd" LZi "<0>abcd" +"abcd" LZi "<0>abcde" +"abcd" LZi "abcx" +"abcd" LZi "abx" + # # All Unicode line endings recognized. # 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029 @@ -176,6 +195,7 @@ "(hello)|(goodbye)" "<0><2>goodbye" "abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3> xyz cruft" "\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d<2> " +"(a|b)c*d" "a<0><1>bcd" # Non-capturing parens (?: stuff). Groups, but does not capture. "(?:abc)*(tail)" "<0>abcabcabc<1>tail" @@ -215,6 +235,7 @@ ".*?\b(.).*" "<0> $%^&*( <1>hello123%^&*()gxx" "\ba\b" "-<0>a" "\by\b" "xy" +"[ \b]" "<0>b" # in a set, \b is a literal b. # Finds first chars of up to 5 words "(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>Tthe <2>qick <3>brown <4>fox" @@ -260,6 +281,12 @@ # \Q...\E quote mode "hel\Qlo, worl\Ed" "<0>hello, world" "\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa" +"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\\r..." # \Q ... \E escape in a [set] + +# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized. +# Note that data strings in test cases still get escape processing. +"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031byeextra" +"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral" # \S and \s space characters "\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029xyz" @@ -452,6 +479,40 @@ "ab(?:c|(d?))(\1)" "<0>ab<1><2>e" "ab(?:c|(d?))(\1)" "<0>ab<1><2>" +# Back References that hit/don't hit end +"(abcd) \1" z "abcd abc" +"(abcd) \1" Z "<0><1>abcd abcd" +"(abcd) \1" Z "<0><1>abcd abcd " + +# Case Insensitve back references that hit/don't hit end. +"(abcd) \1" zi "abcd abc" +"(abcd) \1" Zi "<0><1>abcd ABCD" +"(abcd) \1" Zi "<0><1>abcd ABCD " + +# Back references that hit/don't hit boundary limits. + +"(abcd) \1" z "abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " + +"(abcd) \1" zi "abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " + +# Back reference that fails match near the end of input without actually hitting the end. +"(abcd) \1" ZL "abcd abd" +"(abcd) \1" ZLi "abcd abd" + +# Back reference to a zero-length match. They are always a successful match. +"ab(x?)cd(\1)ef" "<0>ab<1>cd<2>ef" +"ab(x?)cd(\1)ef" i "<0>ab<1>cd<2>ef" + +# Back refs to capture groups that didn't participate in the match. +"ab(?:(c)|(d))\1" "abde" +"ab(?:(c)|(d))\1" "<0>ab<1>cce" +"ab(?:(c)|(d))\1" i "abde" +"ab(?:(c)|(d))\1" i "<0>ab<1>cce" + # Case Insensitive "aBc" i "<0>ABC" "a[^bc]d" i "ABD" @@ -602,6 +663,7 @@ "\ud800\udc00*" "<0>\U00010000\U00010000\U00010000\U00010001" "\ud800\ud800\udc00" "<0>\ud800\U00010000\U00010000\U00010000\U00010001" "(\ud800)(\udc00)" "\U00010000" +"\U00010001+" "<0>\U00010001\U00010001\udc01" # # hitEnd with find() @@ -610,6 +672,12 @@ "abc" 2Z "aaabc <0>abcab" "abc" 3z "aa>abc abcab" +# +# \ escaping +# +"abc\jkl" "<0>abcjkl" # escape of a non-special letter is just itself. +"abc[ \j]kl" "<0>abcjkl" + # # Bug xxxx # @@ -1024,11 +1092,63 @@ "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "<0><1>94040<2>-3344" "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "94040-0000" "(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "00000-3344" + +# +# Bug 8666. Assertion failure on match, bad operand to JMP_SAV_X opcode. +# +"((.??)+|A)*" "<0><1><2>AAAAABBBBBCCCCCDDDDEEEEE" + # +# Bug 8826. Incorrect results with case insensitive matches. +# +"AS(X)" i "aßx" +"AS.*" i "aßx" # Expansion of sharp s can't split between pattern terms. +"ASßS" i "<0>aßß" # All one literal string, does match. +"ASß{1}S" i "aßß" # Pattern with terms, no match. +"aßx" i "<0>assx" +"aßx" i "<0>ASSX" +"aßx" i "<0>aßx" +"ASS(.)" i "<0>aß<1>x" + +# Case Insensitive, probe some corner cases. +"ass+" i "aß" # Second 's' in pattern is qualified, can't combine with first. +"as+" i "aß" +"aßs" i "as" # Can't match half of a ß +"aß+" i "<0>asssssssss" +"aß+" i "<0>assßSssSSSs" +"a(ß?)+" i "<0>assssssss<1>s" +"a(ß?)+" i "<0>a<1>zzzzzzzzs" + +"\U00010400" i "<0>\U00010428" # case folded supplemental code point. + +"sstuff" i "<0>ßtuff" # exercise optimizations on what chars can start a match. +"sstuff" i "s<0>ßtuff" # exercise optimizations on what chars can start a match. +"ßtuff" i "s<0>sstuff" +"ßtuff" i "s<0>Sstuff" + +"a(..)\1" i "<0>A<1>bcBCdef" +"(ß)\1" i "aa<0><1>ssßzz" # Case insensitive back reference +"..(.)\1" i "<0>aa<1>ßss" +"ab(..)\1" i "xx<0>ab<1>ssßss" + +" (ss) ((\1.*)|(.*))" i "<0> <1>ss <2><4>sß" # The back reference 'ss' must not match in 'sß' + +# Bug 9057 +# \u200c and \u200d should be word characters. +# +"\w+" " <0>abc\u200cdef\u200dghi " +"\w+" i " <0>abc\u200cdef\u200dghi " +"[\w]+" " <0>abc\u200cdef\u200dghi " +"[\w]+" i " <0>abc\u200cdef\u200dghi " + +# Bug 9283 +# uregex_open fails for look-behind assertion + case-insensitive + +"(ab)?(?<=ab)cd|ef" i "<0><1>abcd" + # Random debugging, Temporary # #"^(?:a?b?)*$" "a--" -"^(?:a?b?)*$" "a--" "This is a string with (?:one |two |three )endings" "<0>This is a string with two endings" "((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"