1 # regular expression test set
 
   2 # Each line has at least three fields, separated by one or more tabs.  "" stands
 
   3 # for an empty field.  First field is an RE.  Second field is flags.  If the
 
   4 # C flag is given, regcomp() is expected to fail, and the third field is the
 
   5 # error name (minus the leading REG_).
 
   7 # Otherwise it is expected to succeed, and the third field is the string to
 
   8 # try matching it against.  If there is no fourth field, the match is
 
   9 # expected to fail.  If there is a fourth field, it is the substring that
 
  10 # the RE is expected to match.  If there is a fifth field, it is a comma-
 
  11 # separated list of what the subexpressions should match, with - indicating
 
  12 # no match for that one.  In both the fourth and fifth fields, a (sub)field
 
  13 # starting with @ indicates that the (sub)expression is expected to match
 
  14 # a null string followed by the stuff after the @; this provides a way to
 
  15 # test where null strings match.  The character `N' in REs and strings
 
  16 # is newline, `S' is space, `T' is tab, `Z' is NUL.
 
  18 # The full list of flags:
 
  19 #       -       placeholder, does nothing
 
  20 #       b       RE is a BRE, not an ERE
 
  21 #       &       try it as both an ERE and a BRE
 
  22 #       C       regcomp() error expected, third field is error name
 
  24 #       m       ("mundane") REG_NOSPEC
 
  25 #       s       REG_NOSUB (not really testable)
 
  29 #       #       REG_STARTEND (see below)
 
  32 # For REG_STARTEND, the start/end offsets are those of the substring
 
  41 # parentheses and perversions thereof
 
  51 # gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
 
  54 # end gagging (in a just world, those *should* give EPAREN)
 
  61 # anchoring and REG_NEWLINE
 
  72 # stop retching, those are legitimate (although disgusting)
 
  83 \($\)\(^\)      bn      aNNb    @Nb
 
  95 # certain syntax errors and non-errors
 
 126 # metacharacters, backslashes
 
 138 # trailing $ is a peculiar special case for the BRE code
 
 148 # back references, ugh
 
 151 a\(b*\)c\1d     b       abbcbbd abbcbbd bb
 
 153 a\(b*\)c\1d     b       abbcbbbd
 
 155 a\([bc]\)\1d    b       abcdabbd        abbd    b
 
 156 a\(\([bc]\)\2\)*d       b       abbccd  abbccd
 
 157 a\(\([bc]\)\2\)*d       b       abbcbd
 
 158 # actually, this next one probably ought to fail, but the spec is unclear
 
 159 a\(\(b\)*\2\)*d         b       abbbd   abbbd
 
 160 # here is a case that no NFA implementation does right
 
 161 \(ab*\)[ab]*\1  b       ababaaa ababaaa a
 
 162 # check out normal matching in the presence of back refs
 
 163 \(a\)\1bcd      b       aabcd   aabcd
 
 164 \(a\)\1bc*d     b       aabcd   aabcd
 
 165 \(a\)\1bc*d     b       aabd    aabd
 
 166 \(a\)\1bc*d     b       aabcccd aabcccd
 
 167 \(a\)\1bc*[ce]d b       aabcccd aabcccd
 
 168 ^\(a\)\1b\(c\)*cd$      b       aabcccd aabcccd
 
 170 # ordinary repetitions
 
 181 # the dreaded bounded repetitions
 
 212 ab\{0,0\}c      b       abcac   ac
 
 214 ab\{0,1\}c      b       abcac   abc
 
 215 ab{0,3}c        -       abbcac  abbc
 
 216 ab\{0,3\}c      b       abbcac  abbc
 
 218 ab\{1,1\}c      b       acabc   abc
 
 220 ab\{1,3\}c      b       acabc   abc
 
 221 ab{2,2}c        -       abcabbc abbc
 
 222 ab\{2,2\}c      b       abcabbc abbc
 
 223 ab{2,4}c        -       abcabbc abbc
 
 224 ab\{2,4\}c      b       abcabbc abbc
 
 225 ((a{1,10}){1,10}){1,10} -       a       a       a,a
 
 227 # multiple repetitions
 
 245 a\{1\}\{1\}     bC      BADRPT
 
 249 # brackets, and numerous perversions thereof
 
 264 a[[.-.]--]c     &       a-c     a-c
 
 271 a[[.x,.]]       &C      ECOLLATE
 
 272 a[[.one.]]b     &       a1b     a1b
 
 273 a[[.notdef.]]b  &C      ECOLLATE
 
 275 a[[:alpha:]]c   &       abc     abc
 
 276 a[[:notdef:]]c  &C      ECTYPE
 
 279 a[[:alpha:]     &C      EBRACK
 
 280 a[[:alpha,:]    &C      ECTYPE
 
 283 a[[:alph:]]     &C      ECTYPE
 
 284 a[[:alphabet:]] &C      ECTYPE
 
 285 [[:alnum:]]+    -       -%@a0X- a0X
 
 286 [[:alpha:]]+    -       -%@aX0- aX
 
 287 [[:blank:]]+    -       aSSTb   SST
 
 288 [[:cntrl:]]+    -       aNTb    NT
 
 289 [[:digit:]]+    -       a019b   019
 
 290 [[:graph:]]+    -       Sa%bS   a%b
 
 291 [[:lower:]]+    -       AabC    ab
 
 292 [[:print:]]+    -       NaSbN   aSb
 
 293 [[:punct:]]+    -       S%-&T   %-&
 
 294 [[:space:]]+    -       aSNTb   SNT
 
 295 [[:upper:]]+    -       aBCd    BC
 
 296 [[:xdigit:]]+   -       p0f3Cq  0f3C
 
 302 a[[=b,=]]       &C      ECOLLATE
 
 303 a[[=one=]]b     &       a1b     a1b
 
 309 # just gotta have one DFA-buster, of course
 
 310 a[ab]{20}       -       aaaaabaaaabaaaabaaaab   aaaaabaaaabaaaabaaaab
 
 311 # and an inline expansion in case somebody gets tricky
 
 312 a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab]       -       aaaaabaaaabaaaabaaaab   aaaaabaaaabaaaabaaaab
 
 313 # and in case somebody just slips in an NFA...
 
 314 a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night)      -       aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
 
 315 # fish for anomalies as the number of states passes 32
 
 316 12345678901234567890123456789   -       a12345678901234567890123456789b 12345678901234567890123456789
 
 317 123456789012345678901234567890  -       a123456789012345678901234567890b        123456789012345678901234567890
 
 318 1234567890123456789012345678901 -       a1234567890123456789012345678901b       1234567890123456789012345678901
 
 319 12345678901234567890123456789012        -       a12345678901234567890123456789012b      12345678901234567890123456789012
 
 320 123456789012345678901234567890123       -       a123456789012345678901234567890123b     123456789012345678901234567890123
 
 321 # and one really big one, beyond any plausible word width
 
 322 1234567890123456789012345678901234567890123456789012345678901234567890  -       a1234567890123456789012345678901234567890123456789012345678901234567890b        1234567890123456789012345678901234567890123456789012345678901234567890
 
 323 # fish for problems as brackets go past 8
 
 324 [ab][cd][ef][gh][ij][kl][mn]    -       xacegikmoq      acegikm
 
 325 [ab][cd][ef][gh][ij][kl][mn][op]        -       xacegikmoq      acegikmo
 
 326 [ab][cd][ef][gh][ij][kl][mn][op][qr]    -       xacegikmoqy     acegikmoq
 
 327 [ab][cd][ef][gh][ij][kl][mn][op][q]     -       xacegikmoqy     acegikmoq
 
 329 # subtleties of matching
 
 333 a[Bc]*d         i       abBCcd  abBCcd
 
 334 0[[:upper:]]1   &i      0a1     0a1
 
 335 0[[:lower:]]1   &i      0A1     0A1
 
 341 [abc]b[abc]     -       abc     abc
 
 342 [abc]b[abd]     -       abd     abd
 
 344 (wee|week)(knights|night)       -       weeknights      weeknights
 
 345 (we|wee|week|frob)(knights|night|day)   -       weeknights      weeknights
 
 346 a[bc]d          -       xyzaaabcaababdacd       abd
 
 351 # Let's have some fun -- try to match a C comment.
 
 352 # first the obvious, which looks okay at first glance...
 
 353 /\*.*\*/        -       /*x*/   /*x*/
 
 355 /\*.*\*/        -       /*x*/y/*z*/     /*x*/y/*z*/
 
 356 # okay, we must not match */ inside; try to do that...
 
 357 /\*([^*]|\*[^/])*\*/    -       /*x*/   /*x*/
 
 358 /\*([^*]|\*[^/])*\*/    -       /*x*/y/*z*/     /*x*/
 
 360 /\*([^*]|\*[^/])*\*/    -       /*x**/y/*z*/    /*x**/y/*z*/
 
 361 # and a still fancier version, which does it right (I think)...
 
 362 /\*([^*]|\*+[^*/])*\*+/ -       /*x*/   /*x*/
 
 363 /\*([^*]|\*+[^*/])*\*+/ -       /*x*/y/*z*/     /*x*/
 
 364 /\*([^*]|\*+[^*/])*\*+/ -       /*x**/y/*z*/    /*x**/
 
 365 /\*([^*]|\*+[^*/])*\*+/ -       /*x****/y/*z*/  /*x****/
 
 366 /\*([^*]|\*+[^*/])*\*+/ -       /*x**x*/y/*z*/  /*x**x*/
 
 367 /\*([^*]|\*+[^*/])*\*+/ -       /*x***x/y/*z*/  /*x***x/y/*z*/
 
 371 a(b)(c)d        -       abcd    abcd    b,c
 
 372 a(((b)))c       -       abc     abc     b,b,b
 
 373 a(b|(c))d       -       abd     abd     b,-
 
 374 a(b*|c|e)d      -       abbd    abbd    bb
 
 375 a(b*|c|e)d      -       acd     acd     c
 
 376 a(b*|c|e)d      -       ad      ad      @d
 
 380 a(b+)c          -       abbbc   abbbc   bbb
 
 382 (a|ab)(bc([de]+)f|cde)  -       abcdef  abcdef  a,bcdef,de
 
 383 # the regression tester only asks for 9 subexpressions
 
 384 a(b)(c)(d)(e)(f)(g)(h)(i)(j)k   -       abcdefghijk     abcdefghijk     b,c,d,e,f,g,h,i,j
 
 385 a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l        -       abcdefghijkl    abcdefghijkl    b,c,d,e,f,g,h,i,j,k
 
 386 a([bc]?)c       -       abc     abc     b
 
 388 a([bc]+)c       -       abc     abc     b
 
 389 a([bc]+)c       -       abcc    abcc    bc
 
 390 a([bc]+)bc      -       abcbc   abcbc   bc
 
 391 a(bb+|b)b       -       abb     abb     b
 
 392 a(bbb+|bb+|b)b  -       abb     abb     b
 
 393 a(bbb+|bb+|b)b  -       abbb    abbb    bb
 
 394 a(bbb+|bb+|b)bb -       abbb    abbb    b
 
 395 (.*).*          -       abcdef  abcdef  abcdef
 
 398 # do we get the right subexpression when it is used more than once?
 
 400 a(b|c)*d        -       abcd    abcd    c
 
 402 a(b|c)+d        -       abcd    abcd    c
 
 404 a(b|c?)+d       -       abcd    abcd    @d
 
 405 a(b|c){0,0}d    -       ad      ad      -
 
 406 a(b|c){0,1}d    -       ad      ad      -
 
 407 a(b|c){0,1}d    -       abd     abd     b
 
 408 a(b|c){0,2}d    -       ad      ad      -
 
 409 a(b|c){0,2}d    -       abcd    abcd    c
 
 410 a(b|c){0,}d     -       ad      ad      -
 
 411 a(b|c){0,}d     -       abcd    abcd    c
 
 412 a(b|c){1,1}d    -       abd     abd     b
 
 413 a(b|c){1,1}d    -       acd     acd     c
 
 414 a(b|c){1,2}d    -       abd     abd     b
 
 415 a(b|c){1,2}d    -       abcd    abcd    c
 
 416 a(b|c){1,}d     -       abd     abd     b
 
 417 a(b|c){1,}d     -       abcd    abcd    c
 
 418 a(b|c){2,2}d    -       acbd    acbd    b
 
 419 a(b|c){2,2}d    -       abcd    abcd    c
 
 420 a(b|c){2,4}d    -       abcd    abcd    c
 
 421 a(b|c){2,4}d    -       abcbd   abcbd   b
 
 422 a(b|c){2,4}d    -       abcbcd  abcbcd  c
 
 423 a(b|c){2,}d     -       abcd    abcd    c
 
 424 a(b|c){2,}d     -       abcbd   abcbd   b
 
 425 a(b+|((c)*))+d  -       abd     abd     @d,@d,-
 
 426 a(b+|((c)*))+d  -       abcd    abcd    @d,@d,-
 
 428 # check out the STARTEND option
 
 438 # plain strings, with the NOSPEC flag
 
 446 # cases involving NULs
 
 454 # word boundaries (ick)
 
 461 [[:<:]]a.c[[:>:]]       &       axcd-dayc-dazce-abc     abc
 
 462 [[:<:]]a.c[[:>:]]       &       axcd-dayc-dazce-abc-q   abc
 
 463 [[:<:]]a.c[[:>:]]       &       axc-dayc-dazce-abc      axc
 
 464 [[:<:]]b.c[[:>:]]       &       a_bxc-byc_d-bzc-q       bzc
 
 465 [[:<:]].x..[[:>:]]      &       y_xa_-_xb_y-_xc_-axdc   _xc_
 
 466 [[:<:]]a_b[[:>:]]       &       x_a_b
 
 468 # past problems, and suspected problems
 
 469 (A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A])   -       A1      A1
 
 470 abcdefghijklmnop        i       abcdefghijklmnop        abcdefghijklmnop
 
 471 abcdefghijklmnopqrstuv  i       abcdefghijklmnopqrstuv  abcdefghijklmnopqrstuv
 
 472 (ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN])     -       CC11    CC11
 
 473 CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a    -       CC11    CC11
 
 474 Char \([a-z0-9_]*\)\[.* b       Char xyz[k      Char xyz[k      xyz
 
 476 -\{0,1\}[0-9]*$ b       -5      -5
 
 477 a*a*a*a*a*a*a*  &       aaaaaa  aaaaaa