X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/877b5c30d6390744e4fc010c538fc6708251f7a8..e215c9959cfae9db319cbca376553301dfa17cf1:/docs/doxygen/overviews/resyntax.h?ds=sidebyside diff --git a/docs/doxygen/overviews/resyntax.h b/docs/doxygen/overviews/resyntax.h index 501294fe9c..dae9b1ce4d 100644 --- a/docs/doxygen/overviews/resyntax.h +++ b/docs/doxygen/overviews/resyntax.h @@ -6,9 +6,9 @@ // Licence: wxWindows license ///////////////////////////////////////////////////////////////////////////// -/*! +/** -@page overview_resyntax Syntax of the Built-in Regular Expression Library +@page overview_resyntax Regular Expressions A regular expression describes strings of characters. It's a pattern that matches certain strings and doesn't match others. @@ -23,9 +23,9 @@ that matches certain strings and doesn't match others. @li @ref overview_resyntax_bre @li @ref overview_resyntax_characters -@seealso +@see -@li #wxRegEx +@li wxRegEx
@@ -177,18 +177,19 @@ of a character. For example, the following are both identical: [[.0.]-[.9.]] and [[.zero.]-[.nine.]] and mean the same as [0-9]. See @ref overview_resyntax_characters. -Within a bracket expression, a collating element enclosed in @b [= and @b =] -is an equivalence class, standing for the sequences of characters of all -collating elements equivalent to that one, including itself. -An equivalence class may not be an endpoint of a range. -@e wxWidgets: Currently no equivalence classes are defined, so -@b [=X=] stands for just the single character @e X. -@e X can either be a single character literal or the name of a character, -see @ref resynchars_overview. -Within a bracket expression, -the name of a @e character class enclosed in @b [: and @b :] stands for the list -of all characters (not all collating elements!) belonging to that class. -Standard character classes are: +Within a bracket expression, a collating element enclosed in [= and +=] is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. An +equivalence class may not be an endpoint of a range. + +@e wxWidgets: Currently no equivalence classes are defined, so [=X=] +stands for just the single character @c X. @c X can either be a single +character literal or the name of a character, see +@ref overview_resyntax_characters. + +Within a bracket expression, the name of a @e character class enclosed in +[: and :] stands for the list of all characters (not all +collating elements!) belonging to that class. Standard character classes are: @beginTable @row2col{ alpha , A letter. } @@ -206,1504 +207,459 @@ Standard character classes are: @endTable A character class may not be used as an endpoint of a range. -@e wxWidgets: In a non-Unicode build, these character classifications depend on the -current locale, and correspond to the values return by the ANSI C 'is' -functions: isalpha, isupper, etc. 
In Unicode mode they are based on -Unicode classifications, and are not affected by the current locale. -There are two special cases of bracket expressions: -the bracket expressions @b [[::]] and @b [[::]] are constraints, matching empty -strings at the beginning and end of a word respectively. A word is defined -as a sequence of word characters that is neither preceded nor followed -by word characters. A word character is an @e alnum character or an underscore -(@b _). These special bracket expressions are deprecated; users of AREs should -use constraint escapes instead (see #Escapes below). - - -@section overview_resyntax_escapes Escapes - -Escapes (AREs only), -which begin with a @\ followed by an alphanumeric character, come in several -varieties: character entry, class shorthands, constraint escapes, and back -references. A @\ followed by an alphanumeric character but not constituting -a valid escape is illegal in AREs. In EREs, there are no escapes: outside -a bracket expression, a @\ followed by an alphanumeric character merely stands -for that character as an ordinary character, and inside a bracket expression, -@\ is an ordinary character. (The latter is the one actual incompatibility -between EREs and AREs.) 
-Character-entry escapes (AREs only) exist to make -it easier to specify non-printing and otherwise inconvenient characters -in REs: - - - -@b \a - -alert (bell) character, as in C - -@b \b - -backspace, as in C - -@b \B - -synonym -for @b \ to help reduce backslash doubling in some applications where there -are multiple levels of backslash processing - -@b \c@e X - -(where X is any character) -the character whose low-order 5 bits are the same as those of @e X, and whose -other bits are all zero - -@b \e - -the character whose collating-sequence name is -'@b ESC', or failing that, the character with octal value 033 - -@b \f - -formfeed, as in C - -@b \n - -newline, as in C - -@b \r - -carriage return, as in C - -@b \t - -horizontal tab, as in C - -@b \u@e wxyz - -(where @e wxyz is exactly four hexadecimal digits) -the Unicode -character @b U+@e wxyz in the local byte ordering - -@b \U@e stuvwxyz - -(where @e stuvwxyz is -exactly eight hexadecimal digits) reserved for a somewhat-hypothetical Unicode -extension to 32 bits - -@b \v - -vertical tab, as in C are all available. - -@b \x@e hhh - -(where -@e hhh is any sequence of hexadecimal digits) the character whose hexadecimal -value is @b 0x@e hhh (a single character no matter how many hexadecimal digits -are used). - -@b \0 - -the character whose value is @b 0 - -@b \@e xy - -(where @e xy is exactly two -octal digits, and is not a @e back reference (see below)) the character whose -octal value is @b 0@e xy - -@b \@e xyz - -(where @e xyz is exactly three octal digits, and is -not a back reference (see below)) -the character whose octal value is @b 0@e xyz - - - -Hexadecimal digits are '@b 0'-'@b 9', '@b a'-'@b f', and '@b A'-'@b F'. Octal -digits are '@b 0'-'@b 7'. -The character-entry -escapes are always taken as ordinary characters. For example, @b \135 is @b ] in -ASCII, but @b \135 does not terminate a bracket expression. 
Beware, however, -that some applications (e.g., C compilers) interpret such sequences themselves -before the regular-expression package gets to see them, which may require -doubling (quadrupling, etc.) the '@b \'. -Class-shorthand escapes (AREs only) provide -shorthands for certain commonly-used character classes: - - - -@b \d - -@b [[:digit:]] - -@b \s - -@b [[:space:]] - -@b \w - -@b [[:alnum:]_] (note underscore) - -@b \D - -@b [^[:digit:]] - -@b \S - -@b [^[:space:]] - -@b \W - -@b [^[:alnum:]_] (note underscore) - - - -Within bracket expressions, '@b \d', '@b \s', and -'@b \w' lose their outer brackets, and '@b \D', -'@b \S', and '@b \W' are illegal. (So, for example, -@b [a-c\d] is equivalent to @b [a-c[:digit:]]. -Also, @b [a-c\D], which is equivalent to -@b [a-c^[:digit:]], is illegal.) -A constraint escape (AREs only) is a constraint, -matching the empty string if specific conditions are met, written as an -escape: - - - -@b \A - -matches only at the beginning of the string -(see #Matching, below, -for how this differs from '@b ^') -@b \m +@e wxWidgets: In a non-Unicode build, these character classifications depend on +the current locale, and correspond to the values returned by the ANSI C "is" +functions: isalpha, isupper, etc. In Unicode mode they are +based on Unicode classifications, and are not affected by the current locale. -matches only at the beginning of a word +There are two special cases of bracket expressions: the bracket expressions +[[:@<:]] and [[:@>:]] are constraints, matching empty strings at +the beginning and end of a word respectively. A word is defined as a sequence +of word characters that is neither preceded nor followed by word characters. A +word character is an @e alnum character or an underscore (_). These special +bracket expressions are deprecated; users of AREs should use constraint escapes +instead (see escapes below). 
-@b \M -matches only at the end of a word +@section overview_resyntax_escapes Escapes -@b \y +Escapes (AREs only), which begin with a @\ followed by an alphanumeric +character, come in several varieties: character entry, class shorthands, +constraint escapes, and back references. A @\ followed by an +alphanumeric character but not constituting a valid escape is illegal in AREs. +In EREs, there are no escapes: outside a bracket expression, a @\ +followed by an alphanumeric character merely stands for that character as an +ordinary character, and inside a bracket expression, @\ is an ordinary +character. (The latter is the one actual incompatibility between EREs and +AREs.) -matches only at the beginning or end of a word +Character-entry escapes (AREs only) exist to make it easier to specify +non-printing and otherwise inconvenient characters in REs: -@b \Y +@beginTable +@row2col{ @\a , Alert (bell) character, as in C. } +@row2col{ @\b , Backspace, as in C. } +@row2col{ @\B , + Synonym for @\ to help reduce backslash doubling in some + applications where there are multiple levels of backslash processing. } +@row2col{ @\cX , + The character whose low-order 5 bits are the same as those of @e X, and + whose other bits are all zero, where @e X is any character. } +@row2col{ @\e , + The character whose collating-sequence name is @c ESC, or failing that, + the character with octal value 033. } +@row2col{ @\f , Formfeed, as in C. } +@row2col{ @\n , Newline, as in C. } +@row2col{ @\r , Carriage return, as in C. } +@row2col{ @\t , Horizontal tab, as in C. } +@row2col{ @\uwxyz , + The Unicode character U+wxyz in the local byte ordering, where + @e wxyz is exactly four hexadecimal digits. } +@row2col{ @\Ustuvwxyz , + Reserved for a somewhat-hypothetical Unicode extension to 32 bits, where + @e stuvwxyz is exactly eight hexadecimal digits. } +@row2col{ @\v , Vertical tab, as in C. 
} +@row2col{ @\xhhh , + The single character whose hexadecimal value is @e 0xhhh, where @e hhh is + any sequence of hexadecimal digits. } +@row2col{ @\0 , The character whose value is 0. } +@row2col{ @\xy , + The character whose octal value is @e 0xy, where @e xy is exactly two octal + digits, and is not a back reference (see below). } +@row2col{ @\xyz , + The character whose octal value is @e 0xyz, where @e xyz is exactly three + octal digits, and is not a back reference (see below). } +@endTable -matches only at a point that is not the beginning or end of -a word +Hexadecimal digits are 0-9, a-f, and A-F. Octal digits are 0-7. -@b \Z +The character-entry escapes are always taken as ordinary characters. For +example, @\135 is ] in ASCII, but @\135 does not +terminate a bracket expression. Beware, however, that some applications (e.g., +C compilers) interpret such sequences themselves before the regular-expression +package gets to see them, which may require doubling (quadrupling, etc.) the +'@\'. -matches only at the end of the string -(see #Matching, below, for -how this differs from '@b $') +Class-shorthand escapes (AREs only) provide shorthands for certain +commonly-used character classes: -@b \@e m +@beginTable +@row2col{ @\d , [[:digit:]] } +@row2col{ @\s , [[:space:]] } +@row2col{ @\w , [[:alnum:]_] (note underscore) } +@row2col{ @\D , [^[:digit:]] } +@row2col{ @\S , [^[:space:]] } +@row2col{ @\W , [^[:alnum:]_] (note underscore) } +@endTable -(where @e m is a nonzero digit) a @e back reference, -see below +Within bracket expressions, @\d, @\s, and @\w lose +their outer brackets, and @\D, @\S, @\W are illegal. +So, for example, [a-c@\d] is equivalent to [a-c[:digit:]]. +Also, [a-c@\D], which is equivalent to [a-c^[:digit:]], is +illegal. 
-@b \@e mnn +A constraint escape (AREs only) is a constraint, matching the empty string if +specific conditions are met, written as an escape: -(where @e m is a nonzero digit, and @e nn is some more digits, -and the decimal value @e mnn is not greater than the number of closing capturing -parentheses seen so far) a @e back reference, see below +@beginTable +@row2col{ @\A , Matches only at the beginning of the string, see + @ref overview_resyntax_matching for how this differs + from ^. } +@row2col{ @\m , Matches only at the beginning of a word. } +@row2col{ @\M , Matches only at the end of a word. } +@row2col{ @\y , Matches only at the beginning or end of a word. } +@row2col{ @\Y , Matches only at a point that is not the beginning or + end of a word. } +@row2col{ @\Z , Matches only at the end of the string, see + @ref overview_resyntax_matching for how this differs + from @$. } +@row2col{ @\m , A back reference, where @e m is a non-zero + digit. See below. } +@row2col{ @\mnn , + A back reference, where @e m is a nonzero digit, and @e nn is some + more digits, and the decimal value @e mnn is not greater than the number of + closing capturing parentheses seen so far. See below. } +@endTable +A word is defined as in the specification of [[:@<:]] and +[[:@>:]] above. Constraint escapes are illegal within bracket +expressions. +A back reference (AREs only) matches the same string matched by the +parenthesized subexpression specified by the number. For example, "([bc])\1" +matches "bb" or "cc" but not "bc". The subexpression must entirely precede the +back reference in the RE. Subexpressions are numbered in the order of their +leading parentheses. Non-capturing parentheses do not define subexpressions. -A word is defined -as in the specification of @b [[::]] and @b [[::]] above. Constraint escapes are -illegal within bracket expressions. 
-A back reference (AREs only) matches -the same string matched by the parenthesized subexpression specified by -the number, so that (e.g.) @b ([bc])\1 matches @b bb or @b cc but not '@b bc'. -The subexpression -must entirely precede the back reference in the RE. Subexpressions are numbered -in the order of their leading parentheses. Non-capturing parentheses do not -define subexpressions. -There is an inherent historical ambiguity between -octal character-entry escapes and back references, which is resolved by -heuristics, as hinted at above. A leading zero always indicates an octal -escape. A single non-zero digit, not followed by another digit, is always -taken as a back reference. A multi-digit sequence not starting with a zero -is taken as a back reference if it comes after a suitable subexpression -(i.e. the number is in the legal range for a back reference), and otherwise -is taken as octal. +There is an inherent historical ambiguity between octal character-entry escapes +and back references, which is resolved by heuristics, as hinted at above. A +leading zero always indicates an octal escape. A single non-zero digit, not +followed by another digit, is always taken as a back reference. A multi-digit +sequence not starting with a zero is taken as a back reference if it comes +after a suitable subexpression (i.e. the number is in the legal range for a +back reference), and otherwise is taken as octal. @section overview_resyntax_metasyntax Metasyntax -In addition to the main syntax described above, -there are some special forms and miscellaneous syntactic facilities available. +In addition to the main syntax described above, there are some special forms +and miscellaneous syntactic facilities available. + Normally the flavor of RE being used is specified by application-dependent means. However, this can be overridden by a @e director. If an RE of any flavor -begins with '@b ***:', the rest of the RE is an ARE. 
If an RE of any flavor begins -with '@b ***=', the rest of the RE is taken to be a literal string, with all -characters considered ordinary characters. -An ARE may begin with @e embedded options: a sequence @b (?xyz) -(where @e xyz is one or more alphabetic characters) -specifies options affecting the rest of the RE. These supplement, and can -override, any options specified by the application. The available option -letters are: - - - -@b b - -rest of RE is a BRE - -@b c - -case-sensitive matching (usual default) - -@b e - -rest of RE is an ERE - -@b i - -case-insensitive matching (see #Matching, below) - -@b m - -historical synonym for @b n - -@b n - -newline-sensitive matching (see #Matching, below) - -@b p - -partial newline-sensitive matching (see #Matching, below) - -@b q - -rest of RE -is a literal ("quoted'') string, all ordinary characters - -@b s - -non-newline-sensitive matching (usual default) - -@b t - -tight syntax (usual default; see below) - -@b w - -inverse -partial newline-sensitive ("weird'') matching (see #Matching, below) +begins with ***:, the rest of the RE is an ARE. If an RE of any +flavor begins with ***=, the rest of the RE is taken to be a literal +string, with all characters considered ordinary characters. -@b x - -expanded syntax (see below) +An ARE may begin with embedded options: a sequence (?xyz) +(where @e xyz is one or more alphabetic characters) specifies options affecting +the rest of the RE. These supplement, and can override, any options specified +by the application. The available option letters are: +@beginTable +@row2col{ b , Rest of RE is a BRE. } +@row2col{ c , Case-sensitive matching (usual default). } +@row2col{ e , Rest of RE is an ERE. } +@row2col{ i , Case-insensitive matching (see + @ref overview_resyntax_matching, below). } +@row2col{ m , Historical synonym for @e n. } +@row2col{ n , Newline-sensitive matching (see + @ref overview_resyntax_matching, below). 
} +@row2col{ p , Partial newline-sensitive matching (see + @ref overview_resyntax_matching, below). } +@row2col{ q , Rest of RE is a literal ("quoted") string, all ordinary + characters. } +@row2col{ s , Non-newline-sensitive matching (usual default). } +@row2col{ t , Tight syntax (usual default; see below). } +@row2col{ w , Inverse partial newline-sensitive ("weird") matching + (see @ref overview_resyntax_matching, below). } +@row2col{ x , Expanded syntax (see below). } +@endTable +Embedded options take effect at the ) terminating the sequence. They +are available only at the start of an ARE, and may not be used later within it. -Embedded options take effect at the @b ) terminating the -sequence. They are available only at the start of an ARE, and may not be -used later within it. -In addition to the usual (@e tight) RE syntax, in which -all characters are significant, there is an @e expanded syntax, available -in AREs with the embedded -x option. In the expanded syntax, white-space characters are ignored and -all characters between a @b # and the following newline (or the end of the -RE) are ignored, permitting paragraphing and commenting a complex RE. There -are three exceptions to that basic rule: +In addition to the usual (@e tight) RE syntax, in which all characters are +significant, there is an @e expanded syntax, available in AREs with the +embedded x option. In the expanded syntax, white-space characters are ignored +and all characters between a @# and the following newline (or the end +of the RE) are ignored, permitting paragraphing and commenting a complex RE. +There are three exceptions to that basic rule: +@li A white-space character or @# preceded by @\ is retained. +@li White space or @# within a bracket expression is retained. +@li White space and comments are illegal within multi-character symbols like + the ARE (?: or the BRE \(. 
-a white-space character or '@b #' preceded -by '@b \' is retained -white space or '@b #' within a bracket expression is retained -white space and comments are illegal within multi-character symbols like -the ARE '@b (?:' or the BRE '@b \(' +Expanded-syntax white-space characters are blank, tab, newline, and any +character that belongs to the @e space character class. +Finally, in an ARE, outside bracket expressions, the sequence (?@#ttt) +(where @e ttt is any text not containing a )) is a comment, completely +ignored. Again, this is not allowed between the characters of multi-character +symbols like (?:. Such comments are more a historical artifact than a +useful facility, and their use is deprecated; use the expanded syntax instead. -Expanded-syntax white-space characters are blank, -tab, newline, and any character that belongs to the @e space character class. -Finally, in an ARE, outside bracket expressions, the sequence '@b (?#ttt)' (where -@e ttt is any text not containing a '@b )') is a comment, completely ignored. Again, -this is not allowed between the characters of multi-character symbols like -'@b (?:'. Such comments are more a historical artifact than a useful facility, -and their use is deprecated; use the expanded syntax instead. -@e None of these -metasyntax extensions is available if the application (or an initial @b ***= -director) has specified that the user's input be treated as a literal string -rather than as an RE. +@e None of these metasyntax extensions is available if the application (or an +initial ***= director) has specified that the user's input be treated +as a literal string rather than as an RE. @section overview_resyntax_matching Matching -In the event that an RE could match more than -one substring of a given string, the RE matches the one starting earliest -in the string. 
If the RE could match more than one substring starting at -that point, its choice is determined by its @e preference: either the longest -substring, or the shortest. -Most atoms, and all constraints, have no preference. -A parenthesized RE has the same preference (possibly none) as the RE. A -quantified atom with quantifier @b {m} or @b {m}? has the same preference (possibly -none) as the atom itself. A quantified atom with other normal quantifiers -(including @b {m,n} with @e m equal to @e n) prefers longest match. A quantified -atom with other non-greedy quantifiers (including @b {m,n}? with @e m equal to -@e n) prefers shortest match. A branch has the same preference as the first -quantified atom in it which has a preference. An RE consisting of two or -more branches connected by the @b | operator prefers longest match. -Subject to the constraints imposed by the rules for matching the whole RE, subexpressions -also match the longest or shortest possible substrings, based on their -preferences, with subexpressions starting earlier in the RE taking priority -over ones starting later. Note that outer subexpressions thus take priority -over their component subexpressions. -Note that the quantifiers @b {1,1} and -@b {1,1}? can be used to force longest and shortest preference, respectively, -on a subexpression or a whole RE. -Match lengths are measured in characters, -not collating elements. An empty string is considered longer than no match -at all. For example, @b bb* matches the three middle characters -of '@b abbbc', @b (week|wee)(night|knights) -matches all ten characters of '@b weeknights', when @b (.*).* is matched against -@b abc the parenthesized subexpression matches all three characters, and when -@b (a*)* is matched against @b bc both the whole RE and the parenthesized subexpression -match an empty string. -If case-independent matching is specified, the effect -is much as if all case distinctions had vanished from the alphabet. 
When -an alphabetic that exists in multiple cases appears as an ordinary character -outside a bracket expression, it is effectively transformed into a bracket -expression containing both cases, so that @b x becomes '@b [xX]'. When it appears -inside a bracket expression, all case counterparts of it are added to the -bracket expression, so that @b [x] becomes @b [xX] and @b [^x] becomes '@b [^xX]'. -If newline-sensitive -matching is specified, @b . and bracket expressions using @b ^ will never match -the newline character (so that matches will never cross newlines unless -the RE explicitly arranges it) and @b ^ and @b $ will match the empty string after -and before a newline respectively, in addition to matching at beginning -and end of string respectively. ARE @b \A and @b \Z continue to match beginning -or end of string @e only. -If partial newline-sensitive matching is specified, -this affects @b . and bracket expressions as with newline-sensitive matching, -but not @b ^ and '@b $'. -If inverse partial newline-sensitive matching is specified, -this affects @b ^ and @b $ as with newline-sensitive matching, but not @b . and bracket +In the event that an RE could match more than one substring of a given string, +the RE matches the one starting earliest in the string. If the RE could match +more than one substring starting at that point, the choice is determined by +its @e preference: either the longest substring, or the shortest. + +Most atoms, and all constraints, have no preference. A parenthesized RE has the +same preference (possibly none) as the RE. A quantified atom with quantifier +{m} or {m}? has the same preference (possibly none) as the +atom itself. A quantified atom with other normal quantifiers (including +{m,n} with @e m equal to @e n) prefers longest match. A quantified +atom with other non-greedy quantifiers (including {m,n}? with @e m +equal to @e n) prefers shortest match. 
A branch has the same preference as the +first quantified atom in it which has a preference. An RE consisting of two or +more branches connected by the @c | operator prefers longest match. + +Subject to the constraints imposed by the rules for matching the whole RE, +subexpressions also match the longest or shortest possible substrings, based on +their preferences, with subexpressions starting earlier in the RE taking +priority over ones starting later. Note that outer subexpressions thus take +priority over their component subexpressions. + +Note that the quantifiers {1,1} and {1,1}? can be used to +force longest and shortest preference, respectively, on a subexpression or a +whole RE. + +Match lengths are measured in characters, not collating elements. An empty +string is considered longer than no match at all. For example, bb* +matches the three middle characters of "abbbc", +(week|wee)(night|knights) matches all ten characters of "weeknights", +when (.*).* is matched against "abc" the parenthesized subexpression +matches all three characters, and when (a*)* is matched against "bc" +both the whole RE and the parenthesized subexpression match an empty string. + +If case-independent matching is specified, the effect is much as if all case +distinctions had vanished from the alphabet. When an alphabetic that exists in +multiple cases appears as an ordinary character outside a bracket expression, +it is effectively transformed into a bracket expression containing both cases, +so that @c x becomes @c [xX]. When it appears inside a bracket expression, all +case counterparts of it are added to the bracket expression, so that @c [x] +becomes @c [xX] and @c [^x] becomes @c [^xX]. + +If newline-sensitive matching is specified, "." 
and bracket expressions using +"^" will never match the newline character (so that matches will never cross +newlines unless the RE explicitly arranges it) and "^" and "$" will match the +empty string after and before a newline respectively, in addition to matching +at beginning and end of string respectively. ARE @\A and @\Z +continue to match beginning or end of string @e only. + +If partial newline-sensitive matching is specified, this affects "." and +bracket expressions as with newline-sensitive matching, but not "^" and "$". + +If inverse partial newline-sensitive matching is specified, this affects "^" +and "$" as with newline-sensitive matching, but not "." and bracket expressions. This isn't very useful but is provided for symmetry. @section overview_resyntax_limits Limits and Compatibility -No particular limit is imposed on the length of REs. Programs -intended to be highly portable should not employ REs longer than 256 bytes, -as a POSIX-compliant implementation can refuse to accept such REs. -The only -feature of AREs that is actually incompatible with POSIX EREs is that @b \ -does not lose its special significance inside bracket expressions. All other -ARE features use syntax which is illegal or has undefined or unspecified -effects in POSIX EREs; the @b *** syntax of directors likewise is outside -the POSIX syntax for both BREs and EREs. -Many of the ARE extensions are -borrowed from Perl, but some have been changed to clean them up, and a -few Perl extensions are not present. Incompatibilities of note include '@b \b', -'@b \B', the lack of special treatment for a trailing newline, the addition of -complemented bracket expressions to the things affected by newline-sensitive -matching, the restrictions on parentheses and back references in lookahead -constraints, and the longest/shortest-match (rather than first-match) matching -semantics. 
-The matching rules for REs containing both normal and non-greedy -quantifiers have changed since early beta-test versions of this package. -(The new rules are much simpler and cleaner, but don't work as hard at guessing -the user's real intentions.) +No particular limit is imposed on the length of REs. Programs intended to be +highly portable should not employ REs longer than 256 bytes, as a +POSIX-compliant implementation can refuse to accept such REs. + +The only feature of AREs that is actually incompatible with POSIX EREs is that +@\ does not lose its special significance inside bracket expressions. +All other ARE features use syntax which is illegal or has undefined or +unspecified effects in POSIX EREs; the *** syntax of directors +likewise is outside the POSIX syntax for both BREs and EREs. + +Many of the ARE extensions are borrowed from Perl, but some have been changed +to clean them up, and a few Perl extensions are not present. Incompatibilities +of note include @\b, @\B, the lack of special treatment for a +trailing newline, the addition of complemented bracket expressions to the +things affected by newline-sensitive matching, the restrictions on parentheses +and back references in lookahead constraints, and the longest/shortest-match +(rather than first-match) matching semantics. + +The matching rules for REs containing both normal and non-greedy quantifiers +have changed since early beta-test versions of this package. The new rules are +much simpler and cleaner, but don't work as hard at guessing the user's real +intentions. + Henry Spencer's original 1986 @e regexp package, still in widespread use, -implemented an early version of today's EREs. There are four incompatibilities between @e regexp's -near-EREs ('RREs' for short) and AREs. In roughly increasing order of significance: - -In AREs, @b \ followed by an alphanumeric character is either an escape or -an error, while in RREs, it was just another way of writing the alphanumeric. 
-This should not be a problem because there was no reason to write such -a sequence in RREs. -@b { followed by a digit in an ARE is the beginning of -a bound, while in RREs, @b { was always an ordinary character. Such sequences -should be rare, and will often result in an error because following characters -will not look like a valid bound. -In AREs, @b \ remains a special character -within '@b []', so a literal @b \ within @b [] must be -written '@b \\'. @b \\ also gives a literal -@b \ within @b [] in RREs, but only truly paranoid programmers routinely doubled -the backslash. -AREs report the longest/shortest match for the RE, rather -than the first found in a specified search order. This may affect some RREs -which were written in the expectation that the first match would be reported. -(The careful crafting of RREs to optimize the search order for fast matching -is obsolete (AREs examine all possible matches in parallel, and their performance -is largely insensitive to their complexity) but cases where the search -order was exploited to deliberately find a match which was @e not the longest/shortest -will need rewriting.) +implemented an early version of today's EREs. There are four incompatibilities +between @e regexp's near-EREs (RREs for short) and AREs. In roughly increasing +order of significance: + +@li In AREs, @\ followed by an alphanumeric character is either an + escape or an error, while in RREs, it was just another way of writing the + alphanumeric. This should not be a problem because there was no reason to + write such a sequence in RREs. +@li @c { followed by a digit in an ARE is the beginning of a bound, while in + RREs, @c { was always an ordinary character. Such sequences should be rare, + and will often result in an error because following characters will not + look like a valid bound. +@li In AREs, @c @\ remains a special character within @c [], so a literal @c @\ + within @c [] must be written as @\@\. 
@\@\ also gives a + literal @c @\ within @c [] in RREs, but only truly paranoid programmers + routinely doubled the backslash. +@li AREs report the longest/shortest match for the RE, rather than the first + found in a specified search order. This may affect some RREs which were + written in the expectation that the first match would be reported. The + careful crafting of RREs to optimize the search order for fast matching is + obsolete (AREs examine all possible matches in parallel, and their + performance is largely insensitive to their complexity) but cases where the + search order was exploited to deliberately find a match which was @e not + the longest/shortest will need rewriting. @section overview_resyntax_bre Basic Regular Expressions -BREs differ from EREs in -several respects. '@b |', '@b +', and @b ? are ordinary characters and there is no equivalent -for their functionality. The delimiters for bounds -are @b \{ and '@b \}', with @b { and -@b } by themselves ordinary characters. The parentheses for nested subexpressions -are @b \( and '@b \)', with @b ( and @b ) by themselves -ordinary characters. @b ^ is an ordinary +BREs differ from EREs in several respects. @c |, @c +, and @c ? are ordinary +characters and there is no equivalent for their functionality. The delimiters +for bounds are @c @\{ and @c @\}, with @c { and @c } by themselves ordinary +characters. The parentheses for nested subexpressions are @c @\( and @c @\), +with @c ( and @c ) by themselves ordinary characters. @c ^ is an ordinary character except at the beginning of the RE or the beginning of a parenthesized -subexpression, @b $ is an ordinary character except at the end of the RE or -the end of a parenthesized subexpression, and @b * is an ordinary character -if it appears at the beginning of the RE or the beginning of a parenthesized -subexpression (after a possible leading '@b ^'). 
Finally, single-digit back references -are available, and @b \< and @b \> are synonyms -for @b [[:<:]] and @b [[:>:]] respectively; -no other escapes are available. +subexpression, @c $ is an ordinary character except at the end of the RE or the +end of a parenthesized subexpression, and @c * is an ordinary character if it +appears at the beginning of the RE or the beginning of a parenthesized +subexpression (after a possible leading ^). Finally, single-digit back +references are available, and @c @\@< and @c @\@> are synonyms for +[[:@<:]] and [[:@>:]] respectively; no other escapes are +available. @section overview_resyntax_characters Regular Expression Character Names Note that the character names are case sensitive. +
+ + + + -NUL - - - - -'\0' - - - - - -SOH - - - - -'\001' - - - - - -STX - - - - -'\002' - - - - - -ETX - - - - -'\003' - - - - - -EOT - - - - -'\004' - - - - - -ENQ - - - - -'\005' - - - - - -ACK - - - - -'\006' - - - - - -BEL - - - - -'\007' - - - - - -alert - - - - -'\007' - - - - - -BS - - - - -'\010' - - - - - -backspace - - - - -'\b' - - - - - -HT - - - - -'\011' - - - - - -tab - - - - -'\t' - - - - - -LF - - - - -'\012' - - - - - -newline - - - - -'\n' - - - - - -VT - - - - -'\013' - - - - - -vertical-tab - - - - -'\v' - - - - - -FF - - - - -'\014' - - - - - -form-feed - - - - -'\f' - - - - - -CR - - - - -'\015' - - - - - -carriage-return - - - - -'\r' - - - - - -SO - - - - -'\016' - - - - - -SI - - - - -'\017' - - - - - -DLE - - - - -'\020' - - - - - -DC1 - - - - -'\021' - - - - - -DC2 - - - - -'\022' - - - - - -DC3 - - - - -'\023' - - - - - -DC4 - - - - -'\024' - - - - - -NAK - - - - -'\025' - - - - - -SYN - - - - -'\026' - - - - - -ETB - - - - -'\027' - - - - - -CAN - - - - -'\030' - - - - - -EM - - - - -'\031' - - - - - -SUB - - - - -'\032' - - - - - -ESC - - - - -'\033' - - - - - -IS4 - - - - -'\034' - - - - - -FS - - - - -'\034' - - - - - -IS3 - - - - -'\035' - - - - - -GS - - - - -'\035' - - - - - -IS2 - - - - -'\036' - - - - - -RS - - - - -'\036' - - - - - -IS1 - - - - -'\037' - - - - - -US - - - - -'\037' - - - - - -space - - - - -' ' - - - - - -exclamation-mark - - - - -'!' - - - - - -quotation-mark - - - - -'"' - - - - - -number-sign - - - - -'#' - - - - - -dollar-sign - - - - -'$' - - - - - -percent-sign - - - - -'%' - - - - - -ampersand - - - - -'' - - - - - -apostrophe - - - - -'\'' - - - - - -left-parenthesis - - - - -'(' - - - - - -right-parenthesis - - - - -')' - - - - - -asterisk - - - - -'*' - - - - - -plus-sign - - - - -'+' - - - - - -comma - - - - -',' - - - - - -hyphen - - - - -'-' - - - - - -hyphen-minus - - - - -'-' - - - - - -period - - - - -'.' - - - - - -full-stop - - - - -'.' 
- - - - - -slash - - - - -'/' - - - - - -solidus - - - - -'/' - - - - - -zero - - - - -'0' - - - - - -one - - - - -'1' - - - - - -two - - - - -'2' - - - - - -three - - - - -'3' - - - - - -four - - - - -'4' - - - - - -five - - - - -'5' - - - - - -six - - - - -'6' - - - - - -seven - - - - -'7' - - - - - -eight - - - - -'8' - - - - - -nine - - - - -'9' - - - - - -colon - - - - -':' - - - - - -semicolon - - - - -';' - - - - - -less-than-sign - - - - -'' - - - - - -equals-sign - - - - -'=' - - - - - -greater-than-sign - - - - -'' - - - - - -question-mark - - - - -'?' - - - - - -commercial-at - - - - -'@' - - - - - -left-square-bracket - - - - -'[' - - - - - -backslash - - - - -'\' - - - - - -reverse-solidus - - - - -'\' - - - - - -right-square-bracket - - - - -']' - - - - - -circumflex - - - - -'^' - - - - - -circumflex-accent - - - - -'^' - - - - - -underscore - - - - -'_' - - - - - -low-line - - - - -'_' - - - - - -grave-accent - - - - -''' - - - - - -left-brace - - - - -'{' - - - - - -left-curly-bracket - - - - -'{' - - - - - -vertical-line - - - - -'|' - - - - - -right-brace - - - - -'}' - - - - - -right-curly-bracket - - - - -'}' - - - - - -tilde - - - - -'~' - - - - - -DEL - - - + -'\177' +
+@beginTable +@row2col{ NUL , @\0 } +@row2col{ SOH , @\001 } +@row2col{ STX , @\002 } +@row2col{ ETX , @\003 } +@row2col{ EOT , @\004 } +@row2col{ ENQ , @\005 } +@row2col{ ACK , @\006 } +@row2col{ BEL , @\007 } +@row2col{ alert , @\007 } +@row2col{ BS , @\010 } +@row2col{ backspace , @\b } +@row2col{ HT , @\011 } +@row2col{ tab , @\t } +@row2col{ LF , @\012 } +@row2col{ newline , @\n } +@row2col{ VT , @\013 } +@row2col{ vertical-tab , @\v } +@row2col{ FF , @\014 } +@row2col{ form-feed , @\f } +@endTable + +@beginTable +@row2col{ CR , @\015 } +@row2col{ carriage-return , @\r } +@row2col{ SO , @\016 } +@row2col{ SI , @\017 } +@row2col{ DLE , @\020 } +@row2col{ DC1 , @\021 } +@row2col{ DC2 , @\022 } +@row2col{ DC3 , @\023 } +@row2col{ DC4 , @\024 } +@row2col{ NAK , @\025 } +@row2col{ SYN , @\026 } +@row2col{ ETB , @\027 } +@row2col{ CAN , @\030 } +@row2col{ EM , @\031 } +@row2col{ SUB , @\032 } +@row2col{ ESC , @\033 } +@row2col{ IS4 , @\034 } +@row2col{ FS , @\034 } +@row2col{ IS3 , @\035 } +@endTable + +@beginTable +@row2col{ GS , @\035 } +@row2col{ IS2 , @\036 } +@row2col{ RS , @\036 } +@row2col{ IS1 , @\037 } +@row2col{ US , @\037 } +@row2col{ space , " " (space) } +@row2col{ exclamation-mark , ! } +@row2col{ quotation-mark , " } +@row2col{ number-sign , @# } +@row2col{ dollar-sign , @$ } +@row2col{ percent-sign , @% } +@row2col{ ampersand , @& } +@row2col{ apostrophe , ' } +@row2col{ left-parenthesis , ( } +@row2col{ right-parenthesis , ) } +@row2col{ asterisk , * } +@row2col{ plus-sign , + } +@row2col{ comma , \, } +@row2col{ hyphen , - } +@endTable + +@beginTable +@row2col{ hyphen-minus , - } +@row2col{ period , . } +@row2col{ full-stop , . 
} +@row2col{ slash , / } +@row2col{ solidus , / } +@row2col{ zero , 0 } +@row2col{ one , 1 } +@row2col{ two , 2 } +@row2col{ three , 3 } +@row2col{ four , 4 } +@row2col{ five , 5 } +@row2col{ six , 6 } +@row2col{ seven , 7 } +@row2col{ eight , 8 } +@row2col{ nine , 9 } +@row2col{ colon , : } +@row2col{ semicolon , ; } +@row2col{ less-than-sign , @< } +@row2col{ equals-sign , = } +@endTable + +@beginTable +@row2col{ greater-than-sign , @> } +@row2col{ question-mark , ? } +@row2col{ commercial-at , @@ } +@row2col{ left-square-bracket , [ } +@row2col{ backslash , @\ } +@row2col{ reverse-solidus , @\ } +@row2col{ right-square-bracket , ] } +@row2col{ circumflex , ^ } +@row2col{ circumflex-accent , ^ } +@row2col{ underscore , _ } +@row2col{ low-line , _ } +@row2col{ grave-accent , ` } +@row2col{ left-brace , @leftCurly } +@row2col{ left-curly-bracket , @leftCurly } +@row2col{ vertical-line , | } +@row2col{ right-brace , @rightCurly } +@row2col{ right-curly-bracket , @rightCurly } +@row2col{ tilde , ~ } +@row2col{ DEL , @\177 } +@endTable +
*/