docs/doxygen/overviews/resyntax.h

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        resyntax.h
   3 // Purpose:     topic overview
   4 // Author:      wxWidgets team
   5 // RCS-ID:      $Id$
   6 // Licence:     wxWindows licence
   7 /////////////////////////////////////////////////////////////////////////////
   8
   9 /**
  10
  11 @page overview_resyntax Regular Expressions
  12
  13 @tableofcontents
  14
   15 A <em>regular expression</em> describes strings of characters. It's a pattern
  16 that matches certain strings and doesn't match others.
  17
  18 @see wxRegEx
  19
  20
  21
  22 @section overview_resyntax_differentflavors Different Flavors of Regular Expressions
  23
  24 Regular expressions (RE), as defined by POSIX, come in two flavors:
  25 <em>extended regular expressions</em> (ERE) and <em>basic regular
  26 expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
  27 while BREs are roughly those of the traditional @e ed. This implementation
  28 adds a third flavor: <em>advanced regular expressions</em> (ARE), basically
  29 EREs with some significant extensions.
  30
  31 This manual page primarily describes AREs. BREs mostly exist for backward
  32 compatibility in some old programs. POSIX EREs are almost an exact subset of
  33 AREs. Features of AREs that are not present in EREs will be indicated.
  34
  35
  36 @section overview_resyntax_syntax Regular Expression Syntax
  37
  38 These regular expressions are implemented using the package written by Henry
  39 Spencer, based on the 1003.2 spec and some (not quite all) of the Perl5
  40 extensions (thanks, Henry!).  Much of the description of regular expressions
  41 below is copied verbatim from his manual entry.
  42
  43 An ARE is one or more @e branches, separated by "|", matching anything that
  44 matches any of the branches.
  45
  46 A branch is zero or more @e constraints or @e quantified atoms, concatenated.
  47 It matches a match for the first, followed by a match for the second, etc; an
  48 empty branch matches the empty string.
  49
  50 A quantified atom is an @e atom possibly followed by a single @e quantifier.
  51 Without a quantifier, it matches a match for the atom. The quantifiers, and
  52 what a so-quantified atom matches, are:
  53
  54 @beginTable
  55 @row2col{ <tt>*</tt> ,
  56     A sequence of 0 or more matches of the atom. }
  57 @row2col{ <tt>+</tt> ,
  58     A sequence of 1 or more matches of the atom. }
  59 @row2col{ <tt>?</tt> ,
  60     A sequence of 0 or 1 matches of the atom. }
  61 @row2col{ <tt>{m}</tt> ,
  62     A sequence of exactly @e m matches of the atom. }
  63 @row2col{ <tt>{m\,}</tt> ,
  64     A sequence of @e m or more matches of the atom. }
  65 @row2col{ <tt>{m\,n}</tt> ,
  66     A sequence of @e m through @e n (inclusive) matches of the atom; @e m may
  67     not exceed @e n. }
  68 @row2col{ <tt>*? +? ?? {m}? {m\,}? {m\,n}?</tt> ,
  69     @e Non-greedy quantifiers, which match the same possibilities, but prefer
  70     the smallest number rather than the largest number of matches (see
  71     @ref overview_resyntax_matching). }
  72 @endTable
  73
  74 The forms using @b { and @b } are known as @e bounds. The numbers @e m and
  75 @e n are unsigned decimal integers with permissible values from 0 to 255
  76 inclusive. An atom is one of:
  77
  78 @beginTable
  79 @row2col{ <tt>(re)</tt> ,
  80     Where @e re is any regular expression, matches for @e re, with the match
  81     captured for possible reporting. }
  82 @row2col{ <tt>(?:re)</tt> ,
  83     As previous, but does no reporting (a "non-capturing" set of
  84     parentheses). }
  85 @row2col{ <tt>()</tt> ,
  86     Matches an empty string, captured for possible reporting. }
  87 @row2col{ <tt>(?:)</tt> ,
  88     Matches an empty string, without reporting. }
  89 @row2col{ <tt>[chars]</tt> ,
  90     A <em>bracket expression</em>, matching any one of the @e chars (see
  91     @ref overview_resyntax_bracket for more details). }
  92 @row2col{ <tt>.</tt> ,
  93     Matches any single character. }
  94 @row2col{ <tt>@\k</tt> ,
  95     Where @e k is a non-alphanumeric character, matches that character taken
  96     as an ordinary character, e.g. @\@\ matches a backslash character. }
  97 @row2col{ <tt>@\c</tt> ,
  98     Where @e c is alphanumeric (possibly followed by other characters), an
  99     @e escape (AREs only), see @ref overview_resyntax_escapes below. }
 100 @row2col{ <tt>@leftCurly</tt> ,
 101     When followed by a character other than a digit, matches the left-brace
 102     character "@leftCurly"; when followed by a digit, it is the beginning of a
 103     @e bound (see above). }
 104 @row2col{ <tt>x</tt> ,
 105     Where @e x is a single character with no other significance, matches that
 106     character. }
 107 @endTable
 108
 109 A @e constraint matches an empty string when specific conditions are met. A
 110 constraint may not be followed by a quantifier. The simple constraints are as
 111 follows; some more constraints are described later, under
 112 @ref overview_resyntax_escapes.
 113
 114 @beginTable
 115 @row2col{ <tt>^</tt> ,
 116     Matches at the beginning of a line. }
 117 @row2col{ <tt>@$</tt> ,
 118     Matches at the end of a line. }
 119 @row2col{ <tt>(?=re)</tt> ,
 120     @e Positive lookahead (AREs only), matches at any point where a substring
 121     matching @e re begins. }
 122 @row2col{ <tt>(?!re)</tt> ,
 123     @e Negative lookahead (AREs only), matches at any point where no substring
 124     matching @e re begins. }
 125 @endTable
 126
 127 The lookahead constraints may not contain back references (see later), and all
  128 parentheses within them are considered non-capturing. An RE may not end with
  129 <tt>@\</tt>.
 130
 131
 132 @section overview_resyntax_bracket Bracket Expressions
 133
 134 A <em>bracket expression</em> is a list of characters enclosed in <tt>[]</tt>.
 135 It normally matches any single character from the list (but see below). If the
 136 list begins with @c ^, it matches any single character (but see below) @e not
 137 from the rest of the list.
 138
 139 If two characters in the list are separated by <tt>-</tt>, this is shorthand
 140 for the full @e range of characters between those two (inclusive) in the
 141 collating sequence, e.g. <tt>[0-9]</tt> in ASCII matches any decimal digit.
 142 Two ranges may not share an endpoint, so e.g. <tt>a-c-e</tt> is illegal.
 143 Ranges are very collating-sequence-dependent, and portable programs should
 144 avoid relying on them.
 145
 146 To include a literal <tt>]</tt> or <tt>-</tt> in the list, the simplest method
 147 is to enclose it in <tt>[.</tt> and <tt>.]</tt> to make it a collating element
 148 (see below). Alternatively, make it the first character (following a possible
 149 <tt>^</tt>), or (AREs only) precede it with <tt>@\</tt>. Alternatively, for
 150 <tt>-</tt>, make it the last character, or the second endpoint of a range. To
 151 use a literal <tt>-</tt> as the first endpoint of a range, make it a collating
 152 element or (AREs only) precede it with <tt>@\</tt>. With the exception of
 153 these, some combinations using <tt>[</tt> (see next paragraphs), and escapes,
 154 all other special characters lose their special significance within a bracket
 155 expression.
 156
 157 Within a bracket expression, a collating element (a character, a
 158 multi-character sequence that collates as if it were a single character, or a
 159 collating-sequence name for either) enclosed in <tt>[.</tt> and <tt>.]</tt>
 160 stands for the sequence of characters of that collating element.
 161
 162 @e wxWidgets: Currently no multi-character collating elements are defined. So
 163 in <tt>[.X.]</tt>, @c X can either be a single character literal or the name
  164 of a character. For example, the following are identical:
 165 <tt>[[.0.]-[.9.]]</tt> and <tt>[[.zero.]-[.nine.]]</tt> and mean the same as
 166 <tt>[0-9]</tt>. See @ref overview_resyntax_characters.
 167
 168 Within a bracket expression, a collating element enclosed in <tt>[=</tt> and
 169 <tt>=]</tt> is an equivalence class, standing for the sequences of characters
 170 of all collating elements equivalent to that one, including itself. An
 171 equivalence class may not be an endpoint of a range.
 172
 173 @e wxWidgets: Currently no equivalence classes are defined, so <tt>[=X=]</tt>
 174 stands for just the single character @c X. @c X can either be a single
 175 character literal or the name of a character, see
 176 @ref overview_resyntax_characters.
 177
 178 Within a bracket expression, the name of a @e character class enclosed in
 179 <tt>[:</tt> and <tt>:]</tt> stands for the list of all characters (not all
 180 collating elements!) belonging to that class. Standard character classes are:
 181
 182 @beginTable
 183 @row2col{ <tt>alpha</tt>  , A letter. }
 184 @row2col{ <tt>upper</tt>  , An upper-case letter. }
 185 @row2col{ <tt>lower</tt>  , A lower-case letter. }
 186 @row2col{ <tt>digit</tt>  , A decimal digit. }
 187 @row2col{ <tt>xdigit</tt> , A hexadecimal digit. }
 188 @row2col{ <tt>alnum</tt>  , An alphanumeric (letter or digit). }
 189 @row2col{ <tt>print</tt>  , An alphanumeric (same as alnum). }
 190 @row2col{ <tt>blank</tt>  , A space or tab character. }
 191 @row2col{ <tt>space</tt>  , A character producing white space in displayed text. }
 192 @row2col{ <tt>punct</tt>  , A punctuation character. }
 193 @row2col{ <tt>graph</tt>  , A character with a visible representation. }
 194 @row2col{ <tt>cntrl</tt>  , A control character. }
 195 @endTable
 196
 197 A character class may not be used as an endpoint of a range.
 198
 199 @e wxWidgets: In a non-Unicode build, these character classifications depend on
  200 the current locale, and correspond to the values returned by the ANSI C "is"
 201 functions: <tt>isalpha</tt>, <tt>isupper</tt>, etc. In Unicode mode they are
 202 based on Unicode classifications, and are not affected by the current locale.
 203
 204 There are two special cases of bracket expressions: the bracket expressions
 205 <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> are constraints, matching empty strings at
 206 the beginning and end of a word respectively.  A word is defined as a sequence
 207 of word characters that is neither preceded nor followed by word characters. A
 208 word character is an @e alnum character or an underscore (_). These special
 209 bracket expressions are deprecated; users of AREs should use constraint escapes
 210 instead (see escapes below).
 211
 212
 213 @section overview_resyntax_escapes Escapes
 214
 215 Escapes (AREs only), which begin with a <tt>@\</tt> followed by an alphanumeric
 216 character, come in several varieties: character entry, class shorthands,
 217 constraint escapes, and back references. A <tt>@\</tt> followed by an
 218 alphanumeric character but not constituting a valid escape is illegal in AREs.
 219 In EREs, there are no escapes: outside a bracket expression, a <tt>@\</tt>
 220 followed by an alphanumeric character merely stands for that character as an
 221 ordinary character, and inside a bracket expression, <tt>@\</tt> is an ordinary
 222 character. (The latter is the one actual incompatibility between EREs and
 223 AREs.)
 224
 225 Character-entry escapes (AREs only) exist to make it easier to specify
 226 non-printing and otherwise inconvenient characters in REs:
 227
 228 @beginTable
 229 @row2col{ <tt>@\a</tt> , Alert (bell) character, as in C. }
 230 @row2col{ <tt>@\b</tt> , Backspace, as in C. }
 231 @row2col{ <tt>@\B</tt> ,
 232     Synonym for <tt>@\</tt> to help reduce backslash doubling in some
 233     applications where there are multiple levels of backslash processing. }
 234 @row2col{ <tt>@\cX</tt> ,
 235     The character whose low-order 5 bits are the same as those of @e X, and
 236     whose other bits are all zero, where @e X is any character. }
 237 @row2col{ <tt>@\e</tt> ,
 238     The character whose collating-sequence name is @c ESC, or failing that,
 239     the character with octal value 033. }
 240 @row2col{ <tt>@\f</tt> , Formfeed, as in C. }
 241 @row2col{ <tt>@\n</tt> , Newline, as in C. }
 242 @row2col{ <tt>@\r</tt> , Carriage return, as in C. }
 243 @row2col{ <tt>@\t</tt> , Horizontal tab, as in C. }
 244 @row2col{ <tt>@\uwxyz</tt> ,
 245     The Unicode character <tt>U+wxyz</tt> in the local byte ordering, where
 246     @e wxyz is exactly four hexadecimal digits. }
 247 @row2col{ <tt>@\Ustuvwxyz</tt> ,
 248     Reserved for a somewhat-hypothetical Unicode extension to 32 bits, where
 249     @e stuvwxyz is exactly eight hexadecimal digits. }
  250 @row2col{ <tt>@\v</tt> , Vertical tab, as in C. }
 251 @row2col{ <tt>@\xhhh</tt> ,
 252     The single character whose hexadecimal value is @e 0xhhh, where @e hhh is
 253     any sequence of hexadecimal digits. }
 254 @row2col{ <tt>@\0</tt> , The character whose value is 0. }
 255 @row2col{ <tt>@\xy</tt> ,
 256     The character whose octal value is @e 0xy, where @e xy is exactly two octal
 257     digits, and is not a <em>back reference</em> (see below). }
 258 @row2col{ <tt>@\xyz</tt> ,
 259     The character whose octal value is @e 0xyz, where @e xyz is exactly three
 260     octal digits, and is not a <em>back reference</em> (see below). }
 261 @endTable
 262
 263 Hexadecimal digits are 0-9, a-f, and A-F. Octal digits are 0-7.
 264
 265 The character-entry escapes are always taken as ordinary characters. For
 266 example, <tt>@\135</tt> is <tt>]</tt> in ASCII, but <tt>@\135</tt> does not
 267 terminate a bracket expression. Beware, however, that some applications (e.g.,
  268 C compilers) interpret such sequences themselves before the regular-expression
 269 package gets to see them, which may require doubling (quadrupling, etc.) the
 270 '<tt>@\</tt>'.
 271
 272 Class-shorthand escapes (AREs only) provide shorthands for certain
 273 commonly-used character classes:
 274
 275 @beginTable
 276 @row2col{ <tt>@\d</tt> , <tt>[[:digit:]]</tt> }
 277 @row2col{ <tt>@\s</tt> , <tt>[[:space:]]</tt> }
 278 @row2col{ <tt>@\w</tt> , <tt>[[:alnum:]_]</tt> (note underscore) }
 279 @row2col{ <tt>@\D</tt> , <tt>[^[:digit:]]</tt> }
 280 @row2col{ <tt>@\S</tt> , <tt>[^[:space:]]</tt> }
 281 @row2col{ <tt>@\W</tt> , <tt>[^[:alnum:]_]</tt> (note underscore) }
 282 @endTable
 283
 284 Within bracket expressions, <tt>@\d</tt>, <tt>@\s</tt>, and <tt>@\w</tt> lose
 285 their outer brackets, and <tt>@\D</tt>, <tt>@\S</tt>, <tt>@\W</tt> are illegal.
 286 So, for example, <tt>[a-c@\d]</tt> is equivalent to <tt>[a-c[:digit:]]</tt>.
 287 Also, <tt>[a-c@\D]</tt>, which is equivalent to <tt>[a-c^[:digit:]]</tt>, is
 288 illegal.
 289
 290 A constraint escape (AREs only) is a constraint, matching the empty string if
 291 specific conditions are met, written as an escape:
 292
 293 @beginTable
 294 @row2col{ <tt>@\A</tt> , Matches only at the beginning of the string, see
 295                          @ref overview_resyntax_matching for how this differs
 296                          from <tt>^</tt>. }
 297 @row2col{ <tt>@\m</tt> , Matches only at the beginning of a word. }
 298 @row2col{ <tt>@\M</tt> , Matches only at the end of a word. }
 299 @row2col{ <tt>@\y</tt> , Matches only at the beginning or end of a word. }
 300 @row2col{ <tt>@\Y</tt> , Matches only at a point that is not the beginning or
 301                          end of a word. }
 302 @row2col{ <tt>@\Z</tt> , Matches only at the end of the string, see
 303                          @ref overview_resyntax_matching for how this differs
 304                          from <tt>@$</tt>. }
 305 @row2col{ <tt>@\m</tt> , A <em>back reference</em>, where @e m is a non-zero
 306                          digit. See below. }
 307 @row2col{ <tt>@\mnn</tt> ,
 308     A <em>back reference</em>, where @e m is a nonzero digit, and @e nn is some
 309     more digits, and the decimal value @e mnn is not greater than the number of
 310     closing capturing parentheses seen so far. See below. }
 311 @endTable
 312
 313 A word is defined as in the specification of <tt>[[:@<:]]</tt> and
 314 <tt>[[:@>:]]</tt> above. Constraint escapes are illegal within bracket
 315 expressions.
 316
 317 A back reference (AREs only) matches the same string matched by the
  318 parenthesized subexpression specified by the number. For example, "([bc])@\1"
 319 matches "bb" or "cc" but not "bc". The subexpression must entirely precede the
  320 back reference in the RE. Subexpressions are numbered in the order of their
 321 leading parentheses. Non-capturing parentheses do not define subexpressions.
 322
 323 There is an inherent historical ambiguity between octal character-entry escapes
 324 and back references, which is resolved by heuristics, as hinted at above. A
 325 leading zero always indicates an octal escape. A single non-zero digit, not
 326 followed by another digit, is always taken as a back reference. A multi-digit
  327 sequence not starting with a zero is taken as a back reference if it comes
 328 after a suitable subexpression (i.e. the number is in the legal range for a
 329 back reference), and otherwise is taken as octal.
 330
 331
 332 @section overview_resyntax_metasyntax Metasyntax
 333
 334 In addition to the main syntax described above, there are some special forms
 335 and miscellaneous syntactic facilities available.
 336
 337 Normally the flavor of RE being used is specified by application-dependent
 338 means. However, this can be overridden by a @e director. If an RE of any flavor
 339 begins with <tt>***:</tt>, the rest of the RE is an ARE. If an RE of any
 340 flavor begins with <tt>***=</tt>, the rest of the RE is taken to be a literal
 341 string, with all characters considered ordinary characters.
 342
 343 An ARE may begin with <em>embedded options</em>: a sequence <tt>(?xyz)</tt>
 344 (where @e xyz is one or more alphabetic characters) specifies options affecting
 345 the rest of the RE. These supplement, and can override, any options specified
 346 by the application. The available option letters are:
 347
 348 @beginTable
 349 @row2col{ <tt>b</tt> , Rest of RE is a BRE. }
 350 @row2col{ <tt>c</tt> , Case-sensitive matching (usual default). }
 351 @row2col{ <tt>e</tt> , Rest of RE is an ERE. }
 352 @row2col{ <tt>i</tt> , Case-insensitive matching (see
 353                        @ref overview_resyntax_matching, below). }
 354 @row2col{ <tt>m</tt> , Historical synonym for @e n. }
 355 @row2col{ <tt>n</tt> , Newline-sensitive matching (see
 356                        @ref overview_resyntax_matching, below). }
 357 @row2col{ <tt>p</tt> , Partial newline-sensitive matching (see
 358                        @ref overview_resyntax_matching, below). }
 359 @row2col{ <tt>q</tt> , Rest of RE is a literal ("quoted") string, all ordinary
 360                        characters. }
 361 @row2col{ <tt>s</tt> , Non-newline-sensitive matching (usual default). }
 362 @row2col{ <tt>t</tt> , Tight syntax (usual default; see below). }
 363 @row2col{ <tt>w</tt> , Inverse partial newline-sensitive ("weird") matching
 364                        (see @ref overview_resyntax_matching, below). }
 365 @row2col{ <tt>x</tt> , Expanded syntax (see below). }
 366 @endTable
 367
 368 Embedded options take effect at the <tt>)</tt> terminating the sequence. They
 369 are available only at the start of an ARE, and may not be used later within it.
 370
 371 In addition to the usual (@e tight) RE syntax, in which all characters are
 372 significant, there is an @e expanded syntax, available in AREs with the
 373 embedded x option. In the expanded syntax, white-space characters are ignored
 374 and all characters between a <tt>@#</tt> and the following newline (or the end
 375 of the RE) are ignored, permitting paragraphing and commenting a complex RE.
 376 There are three exceptions to that basic rule:
 377
 378 @li A white-space character or <tt>@#</tt> preceded by <tt>@\</tt> is retained.
 379 @li White space or <tt>@#</tt> within a bracket expression is retained.
 380 @li White space and comments are illegal within multi-character symbols like
  381     the ARE <tt>(?:</tt> or the BRE <tt>@\(</tt>.
 382
 383 Expanded-syntax white-space characters are blank, tab, newline, and any
 384 character that belongs to the @e space character class.
 385
 386 Finally, in an ARE, outside bracket expressions, the sequence <tt>(?@#ttt)</tt>
 387 (where @e ttt is any text not containing a <tt>)</tt>) is a comment, completely
 388 ignored. Again, this is not allowed between the characters of multi-character
 389 symbols like <tt>(?:</tt>. Such comments are more a historical artifact than a
 390 useful facility, and their use is deprecated; use the expanded syntax instead.
 391
 392 @e None of these metasyntax extensions is available if the application (or an
 393 initial <tt>***=</tt> director) has specified that the user's input be treated
 394 as a literal string rather than as an RE.
 395
 396
 397 @section overview_resyntax_matching Matching
 398
 399 In the event that an RE could match more than one substring of a given string,
 400 the RE matches the one starting earliest in the string. If the RE could match
 401 more than one substring starting at that point, the choice is determined by
  402 its @e preference: either the longest substring, or the shortest.
 403
 404 Most atoms, and all constraints, have no preference. A parenthesized RE has the
 405 same preference (possibly none) as the RE. A quantified atom with quantifier
 406 <tt>{m}</tt> or <tt>{m}?</tt> has the same preference (possibly none) as the
 407 atom itself. A quantified atom with other normal quantifiers (including
 408 <tt>{m,n}</tt> with @e m equal to @e n) prefers longest match. A quantified
 409 atom with other non-greedy quantifiers (including <tt>{m,n}?</tt> with @e m
 410 equal to @e n) prefers shortest match. A branch has the same preference as the
 411 first quantified atom in it which has a preference. An RE consisting of two or
 412 more branches connected by the @c | operator prefers longest match.
 413
 414 Subject to the constraints imposed by the rules for matching the whole RE,
 415 subexpressions also match the longest or shortest possible substrings, based on
 416 their preferences, with subexpressions starting earlier in the RE taking
 417 priority over ones starting later. Note that outer subexpressions thus take
 418 priority over their component subexpressions.
 419
 420 Note that the quantifiers <tt>{1,1}</tt> and <tt>{1,1}?</tt> can be used to
 421 force longest and shortest preference, respectively, on a subexpression or a
 422 whole RE.
 423
 424 Match lengths are measured in characters, not collating elements. An empty
 425 string is considered longer than no match at all. For example, <tt>bb*</tt>
 426 matches the three middle characters of "abbbc",
 427 <tt>(week|wee)(night|knights)</tt> matches all ten characters of "weeknights",
 428 when <tt>(.*).*</tt> is matched against "abc" the parenthesized subexpression
 429 matches all three characters, and when <tt>(a*)*</tt> is matched against "bc"
 430 both the whole RE and the parenthesized subexpression match an empty string.
 431
 432 If case-independent matching is specified, the effect is much as if all case
 433 distinctions had vanished from the alphabet. When an alphabetic that exists in
 434 multiple cases appears as an ordinary character outside a bracket expression,
 435 it is effectively transformed into a bracket expression containing both cases,
 436 so that @c x becomes @c [xX]. When it appears inside a bracket expression, all
 437 case counterparts of it are added to the bracket expression, so that @c [x]
 438 becomes @c [xX] and @c [^x] becomes @c [^xX].
 439
 440 If newline-sensitive matching is specified, "." and bracket expressions using
 441 "^" will never match the newline character (so that matches will never cross
 442 newlines unless the RE explicitly arranges it) and "^" and "$" will match the
 443 empty string after and before a newline respectively, in addition to matching
 444 at beginning and end of string respectively. ARE <tt>@\A</tt> and <tt>@\Z</tt>
 445 continue to match beginning or end of string @e only.
 446
 447 If partial newline-sensitive matching is specified, this affects "." and
 448 bracket expressions as with newline-sensitive matching, but not "^" and "$".
 449
 450 If inverse partial newline-sensitive matching is specified, this affects "^"
 451 and "$" as with newline-sensitive matching, but not "." and bracket
 452 expressions. This isn't very useful but is provided for symmetry.
 453
 454
 455 @section overview_resyntax_limits Limits and Compatibility
 456
 457 No particular limit is imposed on the length of REs. Programs intended to be
 458 highly portable should not employ REs longer than 256 bytes, as a
 459 POSIX-compliant implementation can refuse to accept such REs.
 460
 461 The only feature of AREs that is actually incompatible with POSIX EREs is that
 462 <tt>@\</tt> does not lose its special significance inside bracket expressions.
 463 All other ARE features use syntax which is illegal or has undefined or
 464 unspecified effects in POSIX EREs; the <tt>***</tt> syntax of directors
 465 likewise is outside the POSIX syntax for both BREs and EREs.
 466
 467 Many of the ARE extensions are borrowed from Perl, but some have been changed
 468 to clean them up, and a few Perl extensions are not present. Incompatibilities
 469 of note include <tt>@\b</tt>, <tt>@\B</tt>, the lack of special treatment for a
 470 trailing newline, the addition of complemented bracket expressions to the
 471 things affected by newline-sensitive matching, the restrictions on parentheses
 472 and back references in lookahead constraints, and the longest/shortest-match
 473 (rather than first-match) matching semantics.
 474
 475 The matching rules for REs containing both normal and non-greedy quantifiers
 476 have changed since early beta-test versions of this package. The new rules are
 477 much simpler and cleaner, but don't work as hard at guessing the user's real
 478 intentions.
 479
 480 Henry Spencer's original 1986 @e regexp package, still in widespread use,
 481 implemented an early version of today's EREs. There are four incompatibilities
 482 between @e regexp's near-EREs (RREs for short) and AREs. In roughly increasing
 483 order of significance:
 484
 485 @li In AREs, <tt>@\</tt> followed by an alphanumeric character is either an
 486     escape or an error, while in RREs, it was just another way of writing the
 487     alphanumeric. This should not be a problem because there was no reason to
 488     write such a sequence in RREs.
 489 @li @c { followed by a digit in an ARE is the beginning of a bound, while in
 490     RREs, @c { was always an ordinary character. Such sequences should be rare,
 491     and will often result in an error because following characters will not
 492     look like a valid bound.
 493 @li In AREs, @c @\ remains a special character within @c [], so a literal @c @\
 494     within @c [] must be written as <tt>@\@\</tt>. <tt>@\@\</tt> also gives a
 495     literal @c @\ within @c [] in RREs, but only truly paranoid programmers
 496     routinely doubled the backslash.
 497 @li AREs report the longest/shortest match for the RE, rather than the first
 498     found in a specified search order. This may affect some RREs which were
 499     written in the expectation that the first match would be reported. The
 500     careful crafting of RREs to optimize the search order for fast matching is
 501     obsolete (AREs examine all possible matches in parallel, and their
 502     performance is largely insensitive to their complexity) but cases where the
 503     search order was exploited to deliberately find a match which was @e not
 504     the longest/shortest will need rewriting.
 505
 506
 507 @section overview_resyntax_bre Basic Regular Expressions
 508
 509 BREs differ from EREs in several respects. @c |, @c +, and @c ? are ordinary
 510 characters and there is no equivalent for their functionality. The delimiters
 511 for bounds are @c @\{ and @c @\}, with @c { and @c } by themselves ordinary
 512 characters. The parentheses for nested subexpressions are @c @\( and @c @\),
 513 with @c ( and @c ) by themselves ordinary characters. @c ^ is an ordinary
 514 character except at the beginning of the RE or the beginning of a parenthesized
 515 subexpression, @c $ is an ordinary character except at the end of the RE or the
 516 end of a parenthesized subexpression, and @c * is an ordinary character if it
 517 appears at the beginning of the RE or the beginning of a parenthesized
 518 subexpression (after a possible leading <tt>^</tt>). Finally, single-digit back
 519 references are available, and @c @\@< and @c @\@> are synonyms for
 520 <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> respectively; no other escapes are
 521 available.
 522
 523
 524 @section overview_resyntax_characters Regular Expression Character Names
 525
 526 Note that the character names are case sensitive.
 527
 528 <center><table class='doctable' border='0' cellspacing='5' cellpadding='4'><tr>
 529
 530 <td>
 531 @beginTable
 532 @row2col{ <tt>NUL</tt> , @\0 }
 533 @row2col{ <tt>SOH</tt> , @\001 }
 534 @row2col{ <tt>STX</tt> , @\002 }
 535 @row2col{ <tt>ETX</tt> , @\003 }
 536 @row2col{ <tt>EOT</tt> , @\004 }
 537 @row2col{ <tt>ENQ</tt> , @\005 }
 538 @row2col{ <tt>ACK</tt> , @\006 }
 539 @row2col{ <tt>BEL</tt> , @\007 }
 540 @row2col{ <tt>alert</tt> , @\007 }
 541 @row2col{ <tt>BS</tt> , @\010 }
 542 @row2col{ <tt>backspace</tt> , @\b }
 543 @row2col{ <tt>HT</tt> , @\011 }
 544 @row2col{ <tt>tab</tt> , @\t }
 545 @row2col{ <tt>LF</tt> , @\012 }
 546 @row2col{ <tt>newline</tt> , @\n }
 547 @row2col{ <tt>VT</tt> , @\013 }
 548 @row2col{ <tt>vertical-tab</tt> , @\v }
 549 @row2col{ <tt>FF</tt> , @\014 }
 550 @row2col{ <tt>form-feed</tt> , @\f }
 551 @endTable
 552 </td>
 553
 554 <td>
 555 @beginTable
 556 @row2col{ <tt>CR</tt> , @\015 }
 557 @row2col{ <tt>carriage-return</tt> , @\r }
 558 @row2col{ <tt>SO</tt> , @\016 }
 559 @row2col{ <tt>SI</tt> , @\017 }
 560 @row2col{ <tt>DLE</tt> , @\020 }
 561 @row2col{ <tt>DC1</tt> , @\021 }
 562 @row2col{ <tt>DC2</tt> , @\022 }
 563 @row2col{ <tt>DC3</tt> , @\023 }
 564 @row2col{ <tt>DC4</tt> , @\024 }
 565 @row2col{ <tt>NAK</tt> , @\025 }
 566 @row2col{ <tt>SYN</tt> , @\026 }
 567 @row2col{ <tt>ETB</tt> , @\027 }
 568 @row2col{ <tt>CAN</tt> , @\030 }
 569 @row2col{ <tt>EM</tt> , @\031 }
 570 @row2col{ <tt>SUB</tt> , @\032 }
 571 @row2col{ <tt>ESC</tt> , @\033 }
 572 @row2col{ <tt>IS4</tt> , @\034 }
 573 @row2col{ <tt>FS</tt> , @\034 }
 574 @row2col{ <tt>IS3</tt> , @\035 }
 575 @endTable
 576 </td>
 577
 578 <td>
 579 @beginTable
 580 @row2col{ <tt>GS</tt> , @\035 }
 581 @row2col{ <tt>IS2</tt> , @\036 }
 582 @row2col{ <tt>RS</tt> , @\036 }
 583 @row2col{ <tt>IS1</tt> , @\037 }
 584 @row2col{ <tt>US</tt> , @\037 }
 585 @row2col{ <tt>space</tt> , " " (space) }
 586 @row2col{ <tt>exclamation-mark</tt> , ! }
 587 @row2col{ <tt>quotation-mark</tt> , " }
 588 @row2col{ <tt>number-sign</tt> , @# }
 589 @row2col{ <tt>dollar-sign</tt> , @$ }
 590 @row2col{ <tt>percent-sign</tt> , @% }
 591 @row2col{ <tt>ampersand</tt> , @& }
 592 @row2col{ <tt>apostrophe</tt> , ' }
 593 @row2col{ <tt>left-parenthesis</tt> , ( }
 594 @row2col{ <tt>right-parenthesis</tt> , ) }
 595 @row2col{ <tt>asterisk</tt> , * }
 596 @row2col{ <tt>plus-sign</tt> , + }
 597 @row2col{ <tt>comma</tt> , \, }
 598 @row2col{ <tt>hyphen</tt> , - }
 599 @endTable
 600 </td>
 601
 602 <td>
 603 @beginTable
 604 @row2col{ <tt>hyphen-minus</tt> , - }
 605 @row2col{ <tt>period</tt> , . }
 606 @row2col{ <tt>full-stop</tt> , . }
 607 @row2col{ <tt>slash</tt> , / }
 608 @row2col{ <tt>solidus</tt> , / }
 609 @row2col{ <tt>zero</tt> , 0 }
 610 @row2col{ <tt>one</tt> , 1 }
 611 @row2col{ <tt>two</tt> , 2 }
 612 @row2col{ <tt>three</tt> , 3 }
 613 @row2col{ <tt>four</tt> , 4 }
 614 @row2col{ <tt>five</tt> , 5 }
 615 @row2col{ <tt>six</tt> , 6 }
 616 @row2col{ <tt>seven</tt> , 7 }
 617 @row2col{ <tt>eight</tt> , 8 }
 618 @row2col{ <tt>nine</tt> , 9 }
 619 @row2col{ <tt>colon</tt> , : }
 620 @row2col{ <tt>semicolon</tt> , ; }
 621 @row2col{ <tt>less-than-sign</tt> , @< }
 622 @row2col{ <tt>equals-sign</tt> , = }
 623 @endTable
 624 </td>
 625
 626 <td>
 627 @beginTable
 628 @row2col{ <tt>greater-than-sign</tt> , @> }
 629 @row2col{ <tt>question-mark</tt> , ? }
 630 @row2col{ <tt>commercial-at</tt> , @@ }
 631 @row2col{ <tt>left-square-bracket</tt> , [ }
 632 @row2col{ <tt>backslash</tt> , @\ }
 633 @row2col{ <tt>reverse-solidus</tt> , @\ }
 634 @row2col{ <tt>right-square-bracket</tt> , ] }
 635 @row2col{ <tt>circumflex</tt> , ^ }
 636 @row2col{ <tt>circumflex-accent</tt> , ^ }
 637 @row2col{ <tt>underscore</tt> , _ }
 638 @row2col{ <tt>low-line</tt> , _ }
  639 @row2col{ <tt>grave-accent</tt> , ` }
 640 @row2col{ <tt>left-brace</tt> , @leftCurly }
 641 @row2col{ <tt>left-curly-bracket</tt> , @leftCurly }
 642 @row2col{ <tt>vertical-line</tt> , | }
 643 @row2col{ <tt>right-brace</tt> , @rightCurly }
 644 @row2col{ <tt>right-curly-bracket</tt> , @rightCurly }
 645 @row2col{ <tt>tilde</tt> , ~ }
 646 @row2col{ <tt>DEL</tt> , @\177 }
 647 @endTable
 648 </td>
 649
 650 </tr></table></center>
 651
 652 */