docs/doxygen/overviews/resyntax.h

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        resyntax.h
   3 // Purpose:     topic overview
   4 // Author:      wxWidgets team
   5 // RCS-ID:      $Id$
   6 // Licence:     wxWindows licence
   7 /////////////////////////////////////////////////////////////////////////////
   8
   9 /**
  10
  11 @page overview_resyntax Regular Expressions
  12
  13 A <em>regular expression</em> describes strings of characters. It's a  pattern
  14 that matches certain strings and doesn't match others.
  15
  16 @li @ref overview_resyntax_differentflavors
  17 @li @ref overview_resyntax_syntax
  18 @li @ref overview_resyntax_bracket
  19 @li @ref overview_resyntax_escapes
  20 @li @ref overview_resyntax_metasyntax
  21 @li @ref overview_resyntax_matching
  22 @li @ref overview_resyntax_limits
  23 @li @ref overview_resyntax_bre
  24 @li @ref overview_resyntax_characters
  25
  26 @see
  27
  28 @li wxRegEx
  29
  30
  31 <hr>
  32
  33
  34 @section overview_resyntax_differentflavors Different Flavors of Regular Expressions
  35
  36 Regular expressions (RE), as defined by POSIX, come in two flavors:
  37 <em>extended regular expressions</em> (ERE) and <em>basic regular
  38 expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
  39 while BREs are roughly those of the traditional @e ed. This implementation
  40 adds a third flavor: <em>advanced regular expressions</em> (ARE), basically
  41 EREs with some significant extensions.
  42
  43 This manual page primarily describes AREs. BREs mostly exist for backward
  44 compatibility in some old programs. POSIX EREs are almost an exact subset of
  45 AREs. Features of AREs that are not present in EREs will be indicated.
  46
  47
  48 @section overview_resyntax_syntax Regular Expression Syntax
  49
  50 These regular expressions are implemented using the package written by Henry
  51 Spencer, based on the 1003.2 spec and some (not quite all) of the Perl5
  52 extensions (thanks, Henry!).  Much of the description of regular expressions
  53 below is copied verbatim from his manual entry.
  54
  55 An ARE is one or more @e branches, separated by "|", matching anything that
  56 matches any of the branches.
  57
  58 A branch is zero or more @e constraints or @e quantified atoms, concatenated.
   59 It matches a match for the first, followed by a match for the second, etc.; an
  60 empty branch matches the empty string.
  61
  62 A quantified atom is an @e atom possibly followed by a single @e quantifier.
  63 Without a quantifier, it matches a match for the atom. The quantifiers, and
  64 what a so-quantified atom matches, are:
  65
  66 @beginTable
  67 @row2col{ <tt>*</tt> ,
  68     A sequence of 0 or more matches of the atom. }
  69 @row2col{ <tt>+</tt> ,
  70     A sequence of 1 or more matches of the atom. }
  71 @row2col{ <tt>?</tt> ,
  72     A sequence of 0 or 1 matches of the atom. }
  73 @row2col{ <tt>{m}</tt> ,
  74     A sequence of exactly @e m matches of the atom. }
  75 @row2col{ <tt>{m\,}</tt> ,
  76     A sequence of @e m or more matches of the atom. }
  77 @row2col{ <tt>{m\,n}</tt> ,
  78     A sequence of @e m through @e n (inclusive) matches of the atom; @e m may
  79     not exceed @e n. }
  80 @row2col{ <tt>*? +? ?? {m}? {m\,}? {m\,n}?</tt> ,
  81     @e Non-greedy quantifiers, which match the same possibilities, but prefer
  82     the smallest number rather than the largest number of matches (see
  83     @ref overview_resyntax_matching). }
  84 @endTable
  85
  86 The forms using @b { and @b } are known as @e bounds. The numbers @e m and
  87 @e n are unsigned decimal integers with permissible values from 0 to 255
  88 inclusive. An atom is one of:
  89
  90 @beginTable
  91 @row2col{ <tt>(re)</tt> ,
  92     Where @e re is any regular expression, matches for @e re, with the match
  93     captured for possible reporting. }
  94 @row2col{ <tt>(?:re)</tt> ,
  95     As previous, but does no reporting (a "non-capturing" set of
  96     parentheses). }
  97 @row2col{ <tt>()</tt> ,
  98     Matches an empty string, captured for possible reporting. }
  99 @row2col{ <tt>(?:)</tt> ,
 100     Matches an empty string, without reporting. }
 101 @row2col{ <tt>[chars]</tt> ,
 102     A <em>bracket expression</em>, matching any one of the @e chars (see
 103     @ref overview_resyntax_bracket for more details). }
 104 @row2col{ <tt>.</tt> ,
 105     Matches any single character. }
 106 @row2col{ <tt>@\k</tt> ,
 107     Where @e k is a non-alphanumeric character, matches that character taken
 108     as an ordinary character, e.g. @\@\ matches a backslash character. }
 109 @row2col{ <tt>@\c</tt> ,
 110     Where @e c is alphanumeric (possibly followed by other characters), an
 111     @e escape (AREs only), see @ref overview_resyntax_escapes below. }
 112 @row2col{ <tt>@leftCurly</tt> ,
 113     When followed by a character other than a digit, matches the left-brace
 114     character "@leftCurly"; when followed by a digit, it is the beginning of a
 115     @e bound (see above). }
 116 @row2col{ <tt>x</tt> ,
 117     Where @e x is a single character with no other significance, matches that
 118     character. }
 119 @endTable
 120
 121 A @e constraint matches an empty string when specific conditions are met. A
 122 constraint may not be followed by a quantifier. The simple constraints are as
 123 follows; some more constraints are described later, under
 124 @ref overview_resyntax_escapes.
 125
 126 @beginTable
 127 @row2col{ <tt>^</tt> ,
 128     Matches at the beginning of a line. }
 129 @row2col{ <tt>@$</tt> ,
 130     Matches at the end of a line. }
 131 @row2col{ <tt>(?=re)</tt> ,
 132     @e Positive lookahead (AREs only), matches at any point where a substring
 133     matching @e re begins. }
 134 @row2col{ <tt>(?!re)</tt> ,
 135     @e Negative lookahead (AREs only), matches at any point where no substring
 136     matching @e re begins. }
 137 @endTable
 138
 139 The lookahead constraints may not contain back references (see later), and all
  140 parentheses within them are considered non-capturing. An RE may not end with
  141 <tt>@\</tt>.
 142
 143
 144 @section overview_resyntax_bracket Bracket Expressions
 145
 146 A <em>bracket expression</em> is a list of characters enclosed in <tt>[]</tt>.
 147 It normally matches any single character from the list (but see below). If the
 148 list begins with @c ^, it matches any single character (but see below) @e not
 149 from the rest of the list.
 150
 151 If two characters in the list are separated by <tt>-</tt>, this is shorthand
 152 for the full @e range of characters between those two (inclusive) in the
 153 collating sequence, e.g. <tt>[0-9]</tt> in ASCII matches any decimal digit.
 154 Two ranges may not share an endpoint, so e.g. <tt>a-c-e</tt> is illegal.
 155 Ranges are very collating-sequence-dependent, and portable programs should
 156 avoid relying on them.
 157
 158 To include a literal <tt>]</tt> or <tt>-</tt> in the list, the simplest method
 159 is to enclose it in <tt>[.</tt> and <tt>.]</tt> to make it a collating element
 160 (see below). Alternatively, make it the first character (following a possible
 161 <tt>^</tt>), or (AREs only) precede it with <tt>@\</tt>. Alternatively, for
 162 <tt>-</tt>, make it the last character, or the second endpoint of a range. To
 163 use a literal <tt>-</tt> as the first endpoint of a range, make it a collating
 164 element or (AREs only) precede it with <tt>@\</tt>. With the exception of
 165 these, some combinations using <tt>[</tt> (see next paragraphs), and escapes,
 166 all other special characters lose their special significance within a bracket
 167 expression.
 168
 169 Within a bracket expression, a collating element (a character, a
 170 multi-character sequence that collates as if it were a single character, or a
 171 collating-sequence name for either) enclosed in <tt>[.</tt> and <tt>.]</tt>
 172 stands for the sequence of characters of that collating element.
 173
 174 @e wxWidgets: Currently no multi-character collating elements are defined. So
 175 in <tt>[.X.]</tt>, @c X can either be a single character literal or the name
  176 of a character. For example, <tt>[[.0.]-[.9.]]</tt> and
  177 <tt>[[.zero.]-[.nine.]]</tt> are identical, and both mean the same as
  178 <tt>[0-9]</tt>. See @ref overview_resyntax_characters.
 179
 180 Within a bracket expression, a collating element enclosed in <tt>[=</tt> and
 181 <tt>=]</tt> is an equivalence class, standing for the sequences of characters
 182 of all collating elements equivalent to that one, including itself. An
 183 equivalence class may not be an endpoint of a range.
 184
 185 @e wxWidgets: Currently no equivalence classes are defined, so <tt>[=X=]</tt>
 186 stands for just the single character @c X. @c X can either be a single
 187 character literal or the name of a character, see
 188 @ref overview_resyntax_characters.
 189
 190 Within a bracket expression, the name of a @e character class enclosed in
 191 <tt>[:</tt> and <tt>:]</tt> stands for the list of all characters (not all
 192 collating elements!) belonging to that class. Standard character classes are:
 193
 194 @beginTable
 195 @row2col{ <tt>alpha</tt>  , A letter. }
 196 @row2col{ <tt>upper</tt>  , An upper-case letter. }
 197 @row2col{ <tt>lower</tt>  , A lower-case letter. }
 198 @row2col{ <tt>digit</tt>  , A decimal digit. }
 199 @row2col{ <tt>xdigit</tt> , A hexadecimal digit. }
 200 @row2col{ <tt>alnum</tt>  , An alphanumeric (letter or digit). }
 201 @row2col{ <tt>print</tt>  , An alphanumeric (same as alnum). }
 202 @row2col{ <tt>blank</tt>  , A space or tab character. }
 203 @row2col{ <tt>space</tt>  , A character producing white space in displayed text. }
 204 @row2col{ <tt>punct</tt>  , A punctuation character. }
 205 @row2col{ <tt>graph</tt>  , A character with a visible representation. }
 206 @row2col{ <tt>cntrl</tt>  , A control character. }
 207 @endTable
 208
 209 A character class may not be used as an endpoint of a range.
 210
 211 @e wxWidgets: In a non-Unicode build, these character classifications depend on
  212 the current locale, and correspond to the values returned by the ANSI C "is"
 213 functions: <tt>isalpha</tt>, <tt>isupper</tt>, etc. In Unicode mode they are
 214 based on Unicode classifications, and are not affected by the current locale.
 215
 216 There are two special cases of bracket expressions: the bracket expressions
 217 <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> are constraints, matching empty strings at
 218 the beginning and end of a word respectively.  A word is defined as a sequence
 219 of word characters that is neither preceded nor followed by word characters. A
 220 word character is an @e alnum character or an underscore (_). These special
 221 bracket expressions are deprecated; users of AREs should use constraint escapes
 222 instead (see escapes below).
 223
 224
 225 @section overview_resyntax_escapes Escapes
 226
 227 Escapes (AREs only), which begin with a <tt>@\</tt> followed by an alphanumeric
 228 character, come in several varieties: character entry, class shorthands,
 229 constraint escapes, and back references. A <tt>@\</tt> followed by an
 230 alphanumeric character but not constituting a valid escape is illegal in AREs.
 231 In EREs, there are no escapes: outside a bracket expression, a <tt>@\</tt>
 232 followed by an alphanumeric character merely stands for that character as an
 233 ordinary character, and inside a bracket expression, <tt>@\</tt> is an ordinary
 234 character. (The latter is the one actual incompatibility between EREs and
 235 AREs.)
 236
 237 Character-entry escapes (AREs only) exist to make it easier to specify
 238 non-printing and otherwise inconvenient characters in REs:
 239
 240 @beginTable
 241 @row2col{ <tt>@\a</tt> , Alert (bell) character, as in C. }
 242 @row2col{ <tt>@\b</tt> , Backspace, as in C. }
 243 @row2col{ <tt>@\B</tt> ,
 244     Synonym for <tt>@\</tt> to help reduce backslash doubling in some
 245     applications where there are multiple levels of backslash processing. }
 246 @row2col{ <tt>@\cX</tt> ,
 247     The character whose low-order 5 bits are the same as those of @e X, and
 248     whose other bits are all zero, where @e X is any character. }
 249 @row2col{ <tt>@\e</tt> ,
 250     The character whose collating-sequence name is @c ESC, or failing that,
 251     the character with octal value 033. }
 252 @row2col{ <tt>@\f</tt> , Formfeed, as in C. }
 253 @row2col{ <tt>@\n</tt> , Newline, as in C. }
 254 @row2col{ <tt>@\r</tt> , Carriage return, as in C. }
 255 @row2col{ <tt>@\t</tt> , Horizontal tab, as in C. }
 256 @row2col{ <tt>@\uwxyz</tt> ,
 257     The Unicode character <tt>U+wxyz</tt> in the local byte ordering, where
 258     @e wxyz is exactly four hexadecimal digits. }
 259 @row2col{ <tt>@\Ustuvwxyz</tt> ,
 260     Reserved for a somewhat-hypothetical Unicode extension to 32 bits, where
 261     @e stuvwxyz is exactly eight hexadecimal digits. }
  262 @row2col{ <tt>@\v</tt> , Vertical tab, as in C. }
 263 @row2col{ <tt>@\xhhh</tt> ,
 264     The single character whose hexadecimal value is @e 0xhhh, where @e hhh is
 265     any sequence of hexadecimal digits. }
 266 @row2col{ <tt>@\0</tt> , The character whose value is 0. }
 267 @row2col{ <tt>@\xy</tt> ,
 268     The character whose octal value is @e 0xy, where @e xy is exactly two octal
 269     digits, and is not a <em>back reference</em> (see below). }
 270 @row2col{ <tt>@\xyz</tt> ,
 271     The character whose octal value is @e 0xyz, where @e xyz is exactly three
 272     octal digits, and is not a <em>back reference</em> (see below). }
 273 @endTable
 274
 275 Hexadecimal digits are 0-9, a-f, and A-F. Octal digits are 0-7.
 276
 277 The character-entry escapes are always taken as ordinary characters. For
 278 example, <tt>@\135</tt> is <tt>]</tt> in ASCII, but <tt>@\135</tt> does not
 279 terminate a bracket expression. Beware, however, that some applications (e.g.,
 280 C compilers) interpret  such sequences themselves before the regular-expression
 281 package gets to see them, which may require doubling (quadrupling, etc.) the
 282 '<tt>@\</tt>'.
 283
 284 Class-shorthand escapes (AREs only) provide shorthands for certain
 285 commonly-used character classes:
 286
 287 @beginTable
 288 @row2col{ <tt>@\d</tt> , <tt>[[:digit:]]</tt> }
 289 @row2col{ <tt>@\s</tt> , <tt>[[:space:]]</tt> }
 290 @row2col{ <tt>@\w</tt> , <tt>[[:alnum:]_]</tt> (note underscore) }
 291 @row2col{ <tt>@\D</tt> , <tt>[^[:digit:]]</tt> }
 292 @row2col{ <tt>@\S</tt> , <tt>[^[:space:]]</tt> }
 293 @row2col{ <tt>@\W</tt> , <tt>[^[:alnum:]_]</tt> (note underscore) }
 294 @endTable
 295
 296 Within bracket expressions, <tt>@\d</tt>, <tt>@\s</tt>, and <tt>@\w</tt> lose
 297 their outer brackets, and <tt>@\D</tt>, <tt>@\S</tt>, <tt>@\W</tt> are illegal.
 298 So, for example, <tt>[a-c@\d]</tt> is equivalent to <tt>[a-c[:digit:]]</tt>.
 299 Also, <tt>[a-c@\D]</tt>, which is equivalent to <tt>[a-c^[:digit:]]</tt>, is
 300 illegal.
 301
 302 A constraint escape (AREs only) is a constraint, matching the empty string if
 303 specific conditions are met, written as an escape:
 304
 305 @beginTable
 306 @row2col{ <tt>@\A</tt> , Matches only at the beginning of the string, see
 307                          @ref overview_resyntax_matching for how this differs
 308                          from <tt>^</tt>. }
 309 @row2col{ <tt>@\m</tt> , Matches only at the beginning of a word. }
 310 @row2col{ <tt>@\M</tt> , Matches only at the end of a word. }
 311 @row2col{ <tt>@\y</tt> , Matches only at the beginning or end of a word. }
 312 @row2col{ <tt>@\Y</tt> , Matches only at a point that is not the beginning or
 313                          end of a word. }
 314 @row2col{ <tt>@\Z</tt> , Matches only at the end of the string, see
 315                          @ref overview_resyntax_matching for how this differs
 316                          from <tt>@$</tt>. }
 317 @row2col{ <tt>@\m</tt> , A <em>back reference</em>, where @e m is a non-zero
 318                          digit. See below. }
 319 @row2col{ <tt>@\mnn</tt> ,
 320     A <em>back reference</em>, where @e m is a nonzero digit, and @e nn is some
 321     more digits, and the decimal value @e mnn is not greater than the number of
 322     closing capturing parentheses seen so far. See below. }
 323 @endTable
 324
 325 A word is defined as in the specification of <tt>[[:@<:]]</tt> and
 326 <tt>[[:@>:]]</tt> above. Constraint escapes are illegal within bracket
 327 expressions.
 328
 329 A back reference (AREs only) matches the same string matched by the
  330 parenthesized subexpression specified by the number. For example, "([bc])@\1"
 331 matches "bb" or "cc" but not "bc". The subexpression must entirely precede the
  332 back reference in the RE. Subexpressions are numbered in the order of their
 333 leading parentheses. Non-capturing parentheses do not define subexpressions.
 334
 335 There is an inherent historical ambiguity between octal character-entry escapes
 336 and back references, which is resolved by heuristics, as hinted at above. A
 337 leading zero always indicates an octal escape. A single non-zero digit, not
 338 followed by another digit, is always taken as a back reference. A multi-digit
 339 sequence not starting with a zero is taken as a back  reference if it comes
 340 after a suitable subexpression (i.e. the number is in the legal range for a
 341 back reference), and otherwise is taken as octal.
 342
 343
 344 @section overview_resyntax_metasyntax Metasyntax
 345
 346 In addition to the main syntax described above, there are some special forms
 347 and miscellaneous syntactic facilities available.
 348
 349 Normally the flavor of RE being used is specified by application-dependent
 350 means. However, this can be overridden by a @e director. If an RE of any flavor
 351 begins with <tt>***:</tt>, the rest of the RE is an ARE. If an RE of any
 352 flavor begins with <tt>***=</tt>, the rest of the RE is taken to be a literal
 353 string, with all characters considered ordinary characters.
 354
 355 An ARE may begin with <em>embedded options</em>: a sequence <tt>(?xyz)</tt>
 356 (where @e xyz is one or more alphabetic characters) specifies options affecting
 357 the rest of the RE. These supplement, and can override, any options specified
 358 by the application. The available option letters are:
 359
 360 @beginTable
 361 @row2col{ <tt>b</tt> , Rest of RE is a BRE. }
 362 @row2col{ <tt>c</tt> , Case-sensitive matching (usual default). }
 363 @row2col{ <tt>e</tt> , Rest of RE is an ERE. }
 364 @row2col{ <tt>i</tt> , Case-insensitive matching (see
 365                        @ref overview_resyntax_matching, below). }
 366 @row2col{ <tt>m</tt> , Historical synonym for @e n. }
 367 @row2col{ <tt>n</tt> , Newline-sensitive matching (see
 368                        @ref overview_resyntax_matching, below). }
 369 @row2col{ <tt>p</tt> , Partial newline-sensitive matching (see
 370                        @ref overview_resyntax_matching, below). }
 371 @row2col{ <tt>q</tt> , Rest of RE is a literal ("quoted") string, all ordinary
 372                        characters. }
 373 @row2col{ <tt>s</tt> , Non-newline-sensitive matching (usual default). }
 374 @row2col{ <tt>t</tt> , Tight syntax (usual default; see below). }
 375 @row2col{ <tt>w</tt> , Inverse partial newline-sensitive ("weird") matching
 376                        (see @ref overview_resyntax_matching, below). }
 377 @row2col{ <tt>x</tt> , Expanded syntax (see below). }
 378 @endTable
 379
 380 Embedded options take effect at the <tt>)</tt> terminating the sequence. They
 381 are available only at the start of an ARE, and may not be used later within it.
 382
 383 In addition to the usual (@e tight) RE syntax, in which all characters are
 384 significant, there is an @e expanded syntax, available in AREs with the
 385 embedded x option. In the expanded syntax, white-space characters are ignored
 386 and all characters between a <tt>@#</tt> and the following newline (or the end
 387 of the RE) are ignored, permitting paragraphing and commenting a complex RE.
 388 There are three exceptions to that basic rule:
 389
 390 @li A white-space character or <tt>@#</tt> preceded by <tt>@\</tt> is retained.
 391 @li White space or <tt>@#</tt> within a bracket expression is retained.
 392 @li White space and comments are illegal within multi-character symbols like
  393     the ARE <tt>(?:</tt> or the BRE <tt>@\(</tt>.
 394
 395 Expanded-syntax white-space characters are blank, tab, newline, and any
 396 character that belongs to the @e space character class.
 397
 398 Finally, in an ARE, outside bracket expressions, the sequence <tt>(?@#ttt)</tt>
 399 (where @e ttt is any text not containing a <tt>)</tt>) is a comment, completely
 400 ignored. Again, this is not allowed between the characters of multi-character
 401 symbols like <tt>(?:</tt>. Such comments are more a historical artifact than a
 402 useful facility, and their use is deprecated; use the expanded syntax instead.
 403
 404 @e None of these metasyntax extensions is available if the application (or an
 405 initial <tt>***=</tt> director) has specified that the user's input be treated
 406 as a literal string rather than as an RE.
 407
 408
 409 @section overview_resyntax_matching Matching
 410
 411 In the event that an RE could match more than one substring of a given string,
 412 the RE matches the one starting earliest in the string. If the RE could match
 413 more than one substring starting at that point, the choice is determined by
  414 its @e preference: either the longest substring, or the shortest.
 415
 416 Most atoms, and all constraints, have no preference. A parenthesized RE has the
 417 same preference (possibly none) as the RE. A quantified atom with quantifier
 418 <tt>{m}</tt> or <tt>{m}?</tt> has the same preference (possibly none) as the
 419 atom itself. A quantified atom with other normal quantifiers (including
 420 <tt>{m,n}</tt> with @e m equal to @e n) prefers longest match. A quantified
 421 atom with other non-greedy quantifiers (including <tt>{m,n}?</tt> with @e m
 422 equal to @e n) prefers shortest match. A branch has the same preference as the
 423 first quantified atom in it which has a preference. An RE consisting of two or
 424 more branches connected by the @c | operator prefers longest match.
 425
 426 Subject to the constraints imposed by the rules for matching the whole RE,
 427 subexpressions also match the longest or shortest possible substrings, based on
 428 their preferences, with subexpressions starting earlier in the RE taking
 429 priority over ones starting later. Note that outer subexpressions thus take
 430 priority over their component subexpressions.
 431
 432 Note that the quantifiers <tt>{1,1}</tt> and <tt>{1,1}?</tt> can be used to
 433 force longest and shortest preference, respectively, on a subexpression or a
 434 whole RE.
 435
 436 Match lengths are measured in characters, not collating elements. An empty
 437 string is considered longer than no match at all. For example, <tt>bb*</tt>
 438 matches the three middle characters of "abbbc",
 439 <tt>(week|wee)(night|knights)</tt> matches all ten characters of "weeknights",
 440 when <tt>(.*).*</tt> is matched against "abc" the parenthesized subexpression
 441 matches all three characters, and when <tt>(a*)*</tt> is matched against "bc"
 442 both the whole RE and the parenthesized subexpression match an empty string.
 443
 444 If case-independent matching is specified, the effect is much as if all case
  445 distinctions had vanished from the alphabet. When an alphabetic character that exists in
 446 multiple cases appears as an ordinary character outside a bracket expression,
 447 it is effectively transformed into a bracket expression containing both cases,
 448 so that @c x becomes @c [xX]. When it appears inside a bracket expression, all
 449 case counterparts of it are added to the bracket expression, so that @c [x]
 450 becomes @c [xX] and @c [^x] becomes @c [^xX].
 451
 452 If newline-sensitive matching is specified, "." and bracket expressions using
 453 "^" will never match the newline character (so that matches will never cross
 454 newlines unless the RE explicitly arranges it) and "^" and "$" will match the
 455 empty string after and before a newline respectively, in addition to matching
 456 at beginning and end of string respectively. ARE <tt>@\A</tt> and <tt>@\Z</tt>
 457 continue to match beginning or end of string @e only.
 458
 459 If partial newline-sensitive matching is specified, this affects "." and
 460 bracket expressions as with newline-sensitive matching, but not "^" and "$".
 461
 462 If inverse partial newline-sensitive matching is specified, this affects "^"
 463 and "$" as with newline-sensitive matching, but not "." and bracket
 464 expressions. This isn't very useful but is provided for symmetry.
 465
 466
 467 @section overview_resyntax_limits Limits and Compatibility
 468
 469 No particular limit is imposed on the length of REs. Programs intended to be
 470 highly portable should not employ REs longer than 256 bytes, as a
 471 POSIX-compliant implementation can refuse to accept such REs.
 472
 473 The only feature of AREs that is actually incompatible with POSIX EREs is that
 474 <tt>@\</tt> does not lose its special significance inside bracket expressions.
 475 All other ARE features use syntax which is illegal or has undefined or
 476 unspecified effects in POSIX EREs; the <tt>***</tt> syntax of directors
 477 likewise is outside the POSIX syntax for both BREs and EREs.
 478
 479 Many of the ARE extensions are borrowed from Perl, but some have been changed
 480 to clean them up, and a few Perl extensions are not present. Incompatibilities
 481 of note include <tt>@\b</tt>, <tt>@\B</tt>, the lack of special treatment for a
 482 trailing newline, the addition of complemented bracket expressions to the
 483 things affected by newline-sensitive matching, the restrictions on parentheses
 484 and back references in lookahead constraints, and the longest/shortest-match
 485 (rather than first-match) matching semantics.
 486
 487 The matching rules for REs containing both normal and non-greedy quantifiers
 488 have changed since early beta-test versions of this package. The new rules are
 489 much simpler and cleaner, but don't work as hard at guessing the user's real
 490 intentions.
 491
 492 Henry Spencer's original 1986 @e regexp package, still in widespread use,
 493 implemented an early version of today's EREs. There are four incompatibilities
 494 between @e regexp's near-EREs (RREs for short) and AREs. In roughly increasing
 495 order of significance:
 496
 497 @li In AREs, <tt>@\</tt> followed by an alphanumeric character is either an
 498     escape or an error, while in RREs, it was just another way of writing the
 499     alphanumeric. This should not be a problem because there was no reason to
 500     write such a sequence in RREs.
 501 @li @c { followed by a digit in an ARE is the beginning of a bound, while in
 502     RREs, @c { was always an ordinary character. Such sequences should be rare,
 503     and will often result in an error because following characters will not
 504     look like a valid bound.
 505 @li In AREs, @c @\ remains a special character within @c [], so a literal @c @\
 506     within @c [] must be written as <tt>@\@\</tt>. <tt>@\@\</tt> also gives a
 507     literal @c @\ within @c [] in RREs, but only truly paranoid programmers
 508     routinely doubled the backslash.
 509 @li AREs report the longest/shortest match for the RE, rather than the first
 510     found in a specified search order. This may affect some RREs which were
 511     written in the expectation that the first match would be reported. The
 512     careful crafting of RREs to optimize the search order for fast matching is
 513     obsolete (AREs examine all possible matches in parallel, and their
 514     performance is largely insensitive to their complexity) but cases where the
 515     search order was exploited to deliberately find a match which was @e not
 516     the longest/shortest will need rewriting.
 517
 518
 519 @section overview_resyntax_bre Basic Regular Expressions
 520
 521 BREs differ from EREs in several respects. @c |, @c +, and @c ? are ordinary
 522 characters and there is no equivalent for their functionality. The delimiters
 523 for bounds are @c @\{ and @c @\}, with @c { and @c } by themselves ordinary
 524 characters. The parentheses for nested subexpressions are @c @\( and @c @\),
 525 with @c ( and @c ) by themselves ordinary characters. @c ^ is an ordinary
 526 character except at the beginning of the RE or the beginning of a parenthesized
 527 subexpression, @c $ is an ordinary character except at the end of the RE or the
 528 end of a parenthesized subexpression, and @c * is an ordinary character if it
 529 appears at the beginning of the RE or the beginning of a parenthesized
 530 subexpression (after a possible leading <tt>^</tt>). Finally, single-digit back
 531 references are available, and @c @\@< and @c @\@> are synonyms for
 532 <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> respectively; no other escapes are
 533 available.
 534
 535
 536 @section overview_resyntax_characters Regular Expression Character Names
 537
 538 Note that the character names are case sensitive.
 539
 540 <center><table class='doctable' border='0' cellspacing='5' cellpadding='4'><tr>
 541
 542 <td>
 543 @beginTable
 544 @row2col{ <tt>NUL</tt> , @\0 }
 545 @row2col{ <tt>SOH</tt> , @\001 }
 546 @row2col{ <tt>STX</tt> , @\002 }
 547 @row2col{ <tt>ETX</tt> , @\003 }
 548 @row2col{ <tt>EOT</tt> , @\004 }
 549 @row2col{ <tt>ENQ</tt> , @\005 }
 550 @row2col{ <tt>ACK</tt> , @\006 }
 551 @row2col{ <tt>BEL</tt> , @\007 }
 552 @row2col{ <tt>alert</tt> , @\007 }
 553 @row2col{ <tt>BS</tt> , @\010 }
 554 @row2col{ <tt>backspace</tt> , @\b }
 555 @row2col{ <tt>HT</tt> , @\011 }
 556 @row2col{ <tt>tab</tt> , @\t }
 557 @row2col{ <tt>LF</tt> , @\012 }
 558 @row2col{ <tt>newline</tt> , @\n }
 559 @row2col{ <tt>VT</tt> , @\013 }
 560 @row2col{ <tt>vertical-tab</tt> , @\v }
 561 @row2col{ <tt>FF</tt> , @\014 }
 562 @row2col{ <tt>form-feed</tt> , @\f }
 563 @endTable
 564 </td>
 565
 566 <td>
 567 @beginTable
 568 @row2col{ <tt>CR</tt> , @\015 }
 569 @row2col{ <tt>carriage-return</tt> , @\r }
 570 @row2col{ <tt>SO</tt> , @\016 }
 571 @row2col{ <tt>SI</tt> , @\017 }
 572 @row2col{ <tt>DLE</tt> , @\020 }
 573 @row2col{ <tt>DC1</tt> , @\021 }
 574 @row2col{ <tt>DC2</tt> , @\022 }
 575 @row2col{ <tt>DC3</tt> , @\023 }
 576 @row2col{ <tt>DC4</tt> , @\024 }
 577 @row2col{ <tt>NAK</tt> , @\025 }
 578 @row2col{ <tt>SYN</tt> , @\026 }
 579 @row2col{ <tt>ETB</tt> , @\027 }
 580 @row2col{ <tt>CAN</tt> , @\030 }
 581 @row2col{ <tt>EM</tt> , @\031 }
 582 @row2col{ <tt>SUB</tt> , @\032 }
 583 @row2col{ <tt>ESC</tt> , @\033 }
 584 @row2col{ <tt>IS4</tt> , @\034 }
 585 @row2col{ <tt>FS</tt> , @\034 }
 586 @row2col{ <tt>IS3</tt> , @\035 }
 587 @endTable
 588 </td>
 589
 590 <td>
 591 @beginTable
 592 @row2col{ <tt>GS</tt> , @\035 }
 593 @row2col{ <tt>IS2</tt> , @\036 }
 594 @row2col{ <tt>RS</tt> , @\036 }
 595 @row2col{ <tt>IS1</tt> , @\037 }
 596 @row2col{ <tt>US</tt> , @\037 }
 597 @row2col{ <tt>space</tt> , " " (space) }
 598 @row2col{ <tt>exclamation-mark</tt> , ! }
 599 @row2col{ <tt>quotation-mark</tt> , " }
 600 @row2col{ <tt>number-sign</tt> , @# }
 601 @row2col{ <tt>dollar-sign</tt> , @$ }
 602 @row2col{ <tt>percent-sign</tt> , @% }
 603 @row2col{ <tt>ampersand</tt> , @& }
 604 @row2col{ <tt>apostrophe</tt> , ' }
 605 @row2col{ <tt>left-parenthesis</tt> , ( }
 606 @row2col{ <tt>right-parenthesis</tt> , ) }
 607 @row2col{ <tt>asterisk</tt> , * }
 608 @row2col{ <tt>plus-sign</tt> , + }
 609 @row2col{ <tt>comma</tt> , \, }
 610 @row2col{ <tt>hyphen</tt> , - }
 611 @endTable
 612 </td>
 613
 614 <td>
 615 @beginTable
 616 @row2col{ <tt>hyphen-minus</tt> , - }
 617 @row2col{ <tt>period</tt> , . }
 618 @row2col{ <tt>full-stop</tt> , . }
 619 @row2col{ <tt>slash</tt> , / }
 620 @row2col{ <tt>solidus</tt> , / }
 621 @row2col{ <tt>zero</tt> , 0 }
 622 @row2col{ <tt>one</tt> , 1 }
 623 @row2col{ <tt>two</tt> , 2 }
 624 @row2col{ <tt>three</tt> , 3 }
 625 @row2col{ <tt>four</tt> , 4 }
 626 @row2col{ <tt>five</tt> , 5 }
 627 @row2col{ <tt>six</tt> , 6 }
 628 @row2col{ <tt>seven</tt> , 7 }
 629 @row2col{ <tt>eight</tt> , 8 }
 630 @row2col{ <tt>nine</tt> , 9 }
 631 @row2col{ <tt>colon</tt> , : }
 632 @row2col{ <tt>semicolon</tt> , ; }
 633 @row2col{ <tt>less-than-sign</tt> , @< }
 634 @row2col{ <tt>equals-sign</tt> , = }
 635 @endTable
 636 </td>
 637
 638 <td>
 639 @beginTable
 640 @row2col{ <tt>greater-than-sign</tt> , @> }
 641 @row2col{ <tt>question-mark</tt> , ? }
 642 @row2col{ <tt>commercial-at</tt> , @@ }
 643 @row2col{ <tt>left-square-bracket</tt> , [ }
 644 @row2col{ <tt>backslash</tt> , @\ }
 645 @row2col{ <tt>reverse-solidus</tt> , @\ }
 646 @row2col{ <tt>right-square-bracket</tt> , ] }
 647 @row2col{ <tt>circumflex</tt> , ^ }
 648 @row2col{ <tt>circumflex-accent</tt> , ^ }
 649 @row2col{ <tt>underscore</tt> , _ }
 650 @row2col{ <tt>low-line</tt> , _ }
  651 @row2col{ <tt>grave-accent</tt> , ` }
 652 @row2col{ <tt>left-brace</tt> , @leftCurly }
 653 @row2col{ <tt>left-curly-bracket</tt> , @leftCurly }
 654 @row2col{ <tt>vertical-line</tt> , | }
 655 @row2col{ <tt>right-brace</tt> , @rightCurly }
 656 @row2col{ <tt>right-curly-bracket</tt> , @rightCurly }
 657 @row2col{ <tt>tilde</tt> , ~ }
 658 @row2col{ <tt>DEL</tt> , @\177 }
 659 @endTable
 660 </td>
 661
 662 </tr></table></center>
 663
 664 */
 665